diff options
author | Christoph Hellwig <hch@lst.de> | 2014-09-10 20:37:27 -0400 |
---|---|---|
committer | Trond Myklebust <trond.myklebust@primarydata.com> | 2014-09-12 13:33:50 -0400 |
commit | 5c83746a0cf2831d4b59f5cf99ef5fbf138564e4 (patch) | |
tree | febe14fb7fea5b7716fc07a2996be4253f09a663 | |
parent | 871760ce97a9a544cfb1ae4589598b25b8570a25 (diff) |
pnfs/blocklayout: in-kernel GETDEVICEINFO XDR parsing
This patch moves parsing of the GETDEVICEINFO XDR to kernel space, as well
as the management of complex devices. The reason for that is we might have
multiple outstanding complex devices after a NOTIFY_DEVICEID4_CHANGE, which
device mapper or md can't handle as they claim devices exclusively.
But as it turns out simple striping / concatenation is fairly trivial to
implement anyway, so we make our life simpler by reducing the reliance
on blkmapd. For now we still use blkmapd by feeding it synthetic SIMPLE
device XDR to translate device signatures to device numbers, but in the
long run I have plans to eliminate it entirely.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
-rw-r--r-- | fs/nfs/blocklayout/Makefile | 2 | ||||
-rw-r--r-- | fs/nfs/blocklayout/blocklayout.c | 92 | ||||
-rw-r--r-- | fs/nfs/blocklayout/blocklayout.h | 83 | ||||
-rw-r--r-- | fs/nfs/blocklayout/dev.c | 360 | ||||
-rw-r--r-- | fs/nfs/blocklayout/rpc_pipefs.c | 141 |
5 files changed, 530 insertions, 148 deletions
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile index e177026e0119..3ca14c36d08b 100644 --- a/fs/nfs/blocklayout/Makefile +++ b/fs/nfs/blocklayout/Makefile | |||
@@ -3,4 +3,4 @@ | |||
3 | # | 3 | # |
4 | obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o | 4 | obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o |
5 | 5 | ||
6 | blocklayoutdriver-y += blocklayout.o extent_tree.o rpc_pipefs.o | 6 | blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o |
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 65a6b19b17a2..c41a718854e3 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c | |||
@@ -114,13 +114,10 @@ bl_submit_bio(int rw, struct bio *bio) | |||
114 | return NULL; | 114 | return NULL; |
115 | } | 115 | } |
116 | 116 | ||
117 | static struct bio *bl_alloc_init_bio(int npg, sector_t isect, | 117 | static struct bio * |
118 | struct pnfs_block_extent *be, | 118 | bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, |
119 | void (*end_io)(struct bio *, int err), | 119 | void (*end_io)(struct bio *, int err), struct parallel_io *par) |
120 | struct parallel_io *par) | ||
121 | { | 120 | { |
122 | struct pnfs_block_dev *dev = | ||
123 | container_of(be->be_device, struct pnfs_block_dev, d_node); | ||
124 | struct bio *bio; | 121 | struct bio *bio; |
125 | 122 | ||
126 | npg = min(npg, BIO_MAX_PAGES); | 123 | npg = min(npg, BIO_MAX_PAGES); |
@@ -131,32 +128,55 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, | |||
131 | } | 128 | } |
132 | 129 | ||
133 | if (bio) { | 130 | if (bio) { |
134 | bio->bi_iter.bi_sector = isect - be->be_f_offset + | 131 | bio->bi_iter.bi_sector = disk_sector; |
135 | be->be_v_offset; | 132 | bio->bi_bdev = bdev; |
136 | bio->bi_bdev = dev->d_bdev; | ||
137 | bio->bi_end_io = end_io; | 133 | bio->bi_end_io = end_io; |
138 | bio->bi_private = par; | 134 | bio->bi_private = par; |
139 | } | 135 | } |
140 | return bio; | 136 | return bio; |
141 | } | 137 | } |
142 | 138 | ||
143 | static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, | 139 | static struct bio * |
144 | sector_t isect, struct page *page, | 140 | do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, |
145 | struct pnfs_block_extent *be, | 141 | struct page *page, struct pnfs_block_dev_map *map, |
146 | void (*end_io)(struct bio *, int err), | 142 | struct pnfs_block_extent *be, |
147 | struct parallel_io *par, | 143 | void (*end_io)(struct bio *, int err), |
148 | unsigned int offset, int len) | 144 | struct parallel_io *par, unsigned int offset, int *len) |
149 | { | 145 | { |
150 | isect = isect + (offset >> SECTOR_SHIFT); | 146 | struct pnfs_block_dev *dev = |
147 | container_of(be->be_device, struct pnfs_block_dev, node); | ||
148 | u64 disk_addr, end; | ||
149 | |||
151 | dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, | 150 | dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, |
152 | npg, rw, (unsigned long long)isect, offset, len); | 151 | npg, rw, (unsigned long long)isect, offset, *len); |
152 | |||
153 | /* translate to device offset */ | ||
154 | isect += be->be_v_offset; | ||
155 | isect -= be->be_f_offset; | ||
156 | |||
157 | /* translate to physical disk offset */ | ||
158 | disk_addr = (u64)isect << SECTOR_SHIFT; | ||
159 | if (disk_addr < map->start || disk_addr >= map->start + map->len) { | ||
160 | if (!dev->map(dev, disk_addr, map)) | ||
161 | return ERR_PTR(-EIO); | ||
162 | bio = bl_submit_bio(rw, bio); | ||
163 | } | ||
164 | disk_addr += map->disk_offset; | ||
165 | disk_addr -= map->start; | ||
166 | |||
167 | /* limit length to what the device mapping allows */ | ||
168 | end = disk_addr + *len; | ||
169 | if (end >= map->start + map->len) | ||
170 | *len = map->start + map->len - disk_addr; | ||
171 | |||
153 | retry: | 172 | retry: |
154 | if (!bio) { | 173 | if (!bio) { |
155 | bio = bl_alloc_init_bio(npg, isect, be, end_io, par); | 174 | bio = bl_alloc_init_bio(npg, map->bdev, |
175 | disk_addr >> SECTOR_SHIFT, end_io, par); | ||
156 | if (!bio) | 176 | if (!bio) |
157 | return ERR_PTR(-ENOMEM); | 177 | return ERR_PTR(-ENOMEM); |
158 | } | 178 | } |
159 | if (bio_add_page(bio, page, len, offset) < len) { | 179 | if (bio_add_page(bio, page, *len, offset) < *len) { |
160 | bio = bl_submit_bio(rw, bio); | 180 | bio = bl_submit_bio(rw, bio); |
161 | goto retry; | 181 | goto retry; |
162 | } | 182 | } |
@@ -203,6 +223,7 @@ static enum pnfs_try_status | |||
203 | bl_read_pagelist(struct nfs_pgio_header *header) | 223 | bl_read_pagelist(struct nfs_pgio_header *header) |
204 | { | 224 | { |
205 | struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); | 225 | struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); |
226 | struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 }; | ||
206 | struct bio *bio = NULL; | 227 | struct bio *bio = NULL; |
207 | struct pnfs_block_extent be; | 228 | struct pnfs_block_extent be; |
208 | sector_t isect, extent_length = 0; | 229 | sector_t isect, extent_length = 0; |
@@ -248,28 +269,29 @@ bl_read_pagelist(struct nfs_pgio_header *header) | |||
248 | pg_len = PAGE_CACHE_SIZE - pg_offset; | 269 | pg_len = PAGE_CACHE_SIZE - pg_offset; |
249 | else | 270 | else |
250 | pg_len = bytes_left; | 271 | pg_len = bytes_left; |
251 | |||
252 | f_offset += pg_len; | ||
253 | bytes_left -= pg_len; | ||
254 | isect += (pg_offset >> SECTOR_SHIFT); | ||
255 | extent_length -= (pg_offset >> SECTOR_SHIFT); | ||
256 | } else { | 272 | } else { |
257 | BUG_ON(pg_offset != 0); | 273 | BUG_ON(pg_offset != 0); |
258 | pg_len = PAGE_CACHE_SIZE; | 274 | pg_len = PAGE_CACHE_SIZE; |
259 | } | 275 | } |
260 | 276 | ||
277 | isect += (pg_offset >> SECTOR_SHIFT); | ||
278 | extent_length -= (pg_offset >> SECTOR_SHIFT); | ||
279 | |||
261 | if (is_hole(&be)) { | 280 | if (is_hole(&be)) { |
262 | bio = bl_submit_bio(READ, bio); | 281 | bio = bl_submit_bio(READ, bio); |
263 | /* Fill hole w/ zeroes w/o accessing device */ | 282 | /* Fill hole w/ zeroes w/o accessing device */ |
264 | dprintk("%s Zeroing page for hole\n", __func__); | 283 | dprintk("%s Zeroing page for hole\n", __func__); |
265 | zero_user_segment(pages[i], pg_offset, pg_len); | 284 | zero_user_segment(pages[i], pg_offset, pg_len); |
285 | |||
286 | /* invalidate map */ | ||
287 | map.start = NFS4_MAX_UINT64; | ||
266 | } else { | 288 | } else { |
267 | bio = do_add_page_to_bio(bio, | 289 | bio = do_add_page_to_bio(bio, |
268 | header->page_array.npages - i, | 290 | header->page_array.npages - i, |
269 | READ, | 291 | READ, |
270 | isect, pages[i], &be, | 292 | isect, pages[i], &map, &be, |
271 | bl_end_io_read, par, | 293 | bl_end_io_read, par, |
272 | pg_offset, pg_len); | 294 | pg_offset, &pg_len); |
273 | if (IS_ERR(bio)) { | 295 | if (IS_ERR(bio)) { |
274 | header->pnfs_error = PTR_ERR(bio); | 296 | header->pnfs_error = PTR_ERR(bio); |
275 | bio = NULL; | 297 | bio = NULL; |
@@ -278,6 +300,8 @@ bl_read_pagelist(struct nfs_pgio_header *header) | |||
278 | } | 300 | } |
279 | isect += (pg_len >> SECTOR_SHIFT); | 301 | isect += (pg_len >> SECTOR_SHIFT); |
280 | extent_length -= (pg_len >> SECTOR_SHIFT); | 302 | extent_length -= (pg_len >> SECTOR_SHIFT); |
303 | f_offset += pg_len; | ||
304 | bytes_left -= pg_len; | ||
281 | } | 305 | } |
282 | if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { | 306 | if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { |
283 | header->res.eof = 1; | 307 | header->res.eof = 1; |
@@ -346,6 +370,7 @@ static enum pnfs_try_status | |||
346 | bl_write_pagelist(struct nfs_pgio_header *header, int sync) | 370 | bl_write_pagelist(struct nfs_pgio_header *header, int sync) |
347 | { | 371 | { |
348 | struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); | 372 | struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); |
373 | struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 }; | ||
349 | struct bio *bio = NULL; | 374 | struct bio *bio = NULL; |
350 | struct pnfs_block_extent be; | 375 | struct pnfs_block_extent be; |
351 | sector_t isect, extent_length = 0; | 376 | sector_t isect, extent_length = 0; |
@@ -354,6 +379,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) | |||
354 | size_t count = header->args.count; | 379 | size_t count = header->args.count; |
355 | struct page **pages = header->args.pages; | 380 | struct page **pages = header->args.pages; |
356 | int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; | 381 | int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; |
382 | unsigned int pg_len; | ||
357 | struct blk_plug plug; | 383 | struct blk_plug plug; |
358 | int i; | 384 | int i; |
359 | 385 | ||
@@ -387,19 +413,21 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) | |||
387 | extent_length = be.be_length - (isect - be.be_f_offset); | 413 | extent_length = be.be_length - (isect - be.be_f_offset); |
388 | } | 414 | } |
389 | 415 | ||
416 | pg_len = PAGE_CACHE_SIZE; | ||
390 | bio = do_add_page_to_bio(bio, header->page_array.npages - i, | 417 | bio = do_add_page_to_bio(bio, header->page_array.npages - i, |
391 | WRITE, isect, pages[i], &be, | 418 | WRITE, isect, pages[i], &map, &be, |
392 | bl_end_io_write, par, | 419 | bl_end_io_write, par, |
393 | 0, PAGE_CACHE_SIZE); | 420 | 0, &pg_len); |
394 | if (IS_ERR(bio)) { | 421 | if (IS_ERR(bio)) { |
395 | header->pnfs_error = PTR_ERR(bio); | 422 | header->pnfs_error = PTR_ERR(bio); |
396 | bio = NULL; | 423 | bio = NULL; |
397 | goto out; | 424 | goto out; |
398 | } | 425 | } |
399 | offset += PAGE_CACHE_SIZE; | 426 | |
400 | count -= PAGE_CACHE_SIZE; | 427 | offset += pg_len; |
401 | isect += PAGE_CACHE_SECTORS; | 428 | count -= pg_len; |
402 | extent_length -= PAGE_CACHE_SECTORS; | 429 | isect += (pg_len >> SECTOR_SHIFT); |
430 | extent_length -= (pg_len >> SECTOR_SHIFT); | ||
403 | } | 431 | } |
404 | 432 | ||
405 | header->res.count = header->args.count; | 433 | header->res.count = header->args.count; |
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index c98d98a62664..92dca9e90d8d 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h | |||
@@ -44,9 +44,77 @@ | |||
44 | #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) | 44 | #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) |
45 | #define SECTOR_SIZE (1 << SECTOR_SHIFT) | 45 | #define SECTOR_SIZE (1 << SECTOR_SHIFT) |
46 | 46 | ||
47 | struct pnfs_block_dev; | ||
48 | |||
49 | enum pnfs_block_volume_type { | ||
50 | PNFS_BLOCK_VOLUME_SIMPLE = 0, | ||
51 | PNFS_BLOCK_VOLUME_SLICE = 1, | ||
52 | PNFS_BLOCK_VOLUME_CONCAT = 2, | ||
53 | PNFS_BLOCK_VOLUME_STRIPE = 3, | ||
54 | }; | ||
55 | |||
56 | #define PNFS_BLOCK_MAX_UUIDS 4 | ||
57 | #define PNFS_BLOCK_MAX_DEVICES 64 | ||
58 | |||
59 | /* | ||
60 | * Random upper cap for the uuid length to avoid unbounded allocation. | ||
61 | * Not actually limited by the protocol. | ||
62 | */ | ||
63 | #define PNFS_BLOCK_UUID_LEN 128 | ||
64 | |||
65 | |||
66 | struct pnfs_block_volume { | ||
67 | enum pnfs_block_volume_type type; | ||
68 | union { | ||
69 | struct { | ||
70 | int len; | ||
71 | int nr_sigs; | ||
72 | struct { | ||
73 | u64 offset; | ||
74 | u32 sig_len; | ||
75 | u8 sig[PNFS_BLOCK_UUID_LEN]; | ||
76 | } sigs[PNFS_BLOCK_MAX_UUIDS]; | ||
77 | } simple; | ||
78 | struct { | ||
79 | u64 start; | ||
80 | u64 len; | ||
81 | u32 volume; | ||
82 | } slice; | ||
83 | struct { | ||
84 | u32 volumes_count; | ||
85 | u32 volumes[PNFS_BLOCK_MAX_DEVICES]; | ||
86 | } concat; | ||
87 | struct { | ||
88 | u64 chunk_size; | ||
89 | u32 volumes_count; | ||
90 | u32 volumes[PNFS_BLOCK_MAX_DEVICES]; | ||
91 | } stripe; | ||
92 | }; | ||
93 | }; | ||
94 | |||
95 | struct pnfs_block_dev_map { | ||
96 | sector_t start; | ||
97 | sector_t len; | ||
98 | |||
99 | sector_t disk_offset; | ||
100 | struct block_device *bdev; | ||
101 | }; | ||
102 | |||
47 | struct pnfs_block_dev { | 103 | struct pnfs_block_dev { |
48 | struct nfs4_deviceid_node d_node; | 104 | struct nfs4_deviceid_node node; |
49 | struct block_device *d_bdev; | 105 | |
106 | u64 start; | ||
107 | u64 len; | ||
108 | |||
109 | u32 nr_children; | ||
110 | struct pnfs_block_dev *children; | ||
111 | u64 chunk_size; | ||
112 | |||
113 | struct block_device *bdev; | ||
114 | u64 disk_offset; | ||
115 | |||
116 | bool (*map)(struct pnfs_block_dev *dev, u64 offset, | ||
117 | struct pnfs_block_dev_map *map); | ||
50 | }; | 118 | }; |
51 | 119 | ||
52 | enum exstate4 { | 120 | enum exstate4 { |
@@ -110,6 +178,11 @@ struct bl_msg_hdr { | |||
110 | #define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ | 178 | #define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ |
111 | #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ | 179 | #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ |
112 | 180 | ||
181 | /* dev.c */ | ||
182 | struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server, | ||
183 | struct pnfs_device *pdev, gfp_t gfp_mask); | ||
184 | void bl_free_deviceid_node(struct nfs4_deviceid_node *d); | ||
185 | |||
113 | /* extent_tree.c */ | 186 | /* extent_tree.c */ |
114 | int ext_tree_insert(struct pnfs_block_layout *bl, | 187 | int ext_tree_insert(struct pnfs_block_layout *bl, |
115 | struct pnfs_block_extent *new); | 188 | struct pnfs_block_extent *new); |
@@ -123,10 +196,8 @@ int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg); | |||
123 | void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status); | 196 | void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status); |
124 | 197 | ||
125 | /* rpc_pipefs.c */ | 198 | /* rpc_pipefs.c */ |
126 | struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server, | 199 | dev_t bl_resolve_deviceid(struct nfs_server *server, |
127 | struct pnfs_device *pdev, gfp_t gfp_mask); | 200 | struct pnfs_block_volume *b, gfp_t gfp_mask); |
128 | void bl_free_deviceid_node(struct nfs4_deviceid_node *d); | ||
129 | |||
130 | int __init bl_init_pipefs(void); | 201 | int __init bl_init_pipefs(void); |
131 | void __exit bl_cleanup_pipefs(void); | 202 | void __exit bl_cleanup_pipefs(void); |
132 | 203 | ||
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c new file mode 100644 index 000000000000..00f159da06ee --- /dev/null +++ b/fs/nfs/blocklayout/dev.c | |||
@@ -0,0 +1,360 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2014 Christoph Hellwig. | ||
3 | */ | ||
4 | #include <linux/sunrpc/svc.h> | ||
5 | #include <linux/blkdev.h> | ||
6 | #include <linux/nfs4.h> | ||
7 | #include <linux/nfs_fs.h> | ||
8 | #include <linux/nfs_xdr.h> | ||
9 | |||
10 | #include "blocklayout.h" | ||
11 | |||
12 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD | ||
13 | |||
14 | static void | ||
15 | bl_free_device(struct pnfs_block_dev *dev) | ||
16 | { | ||
17 | if (dev->nr_children) { | ||
18 | int i; | ||
19 | |||
20 | for (i = 0; i < dev->nr_children; i++) | ||
21 | bl_free_device(&dev->children[i]); | ||
22 | kfree(dev->children); | ||
23 | } else { | ||
24 | if (dev->bdev) | ||
25 | blkdev_put(dev->bdev, FMODE_READ); | ||
26 | } | ||
27 | } | ||
28 | |||
29 | void | ||
30 | bl_free_deviceid_node(struct nfs4_deviceid_node *d) | ||
31 | { | ||
32 | struct pnfs_block_dev *dev = | ||
33 | container_of(d, struct pnfs_block_dev, node); | ||
34 | |||
35 | bl_free_device(dev); | ||
36 | kfree(dev); | ||
37 | } | ||
38 | |||
39 | static int | ||
40 | nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) | ||
41 | { | ||
42 | __be32 *p; | ||
43 | int i; | ||
44 | |||
45 | p = xdr_inline_decode(xdr, 4); | ||
46 | if (!p) | ||
47 | return -EIO; | ||
48 | b->type = be32_to_cpup(p++); | ||
49 | |||
50 | switch (b->type) { | ||
51 | case PNFS_BLOCK_VOLUME_SIMPLE: | ||
52 | p = xdr_inline_decode(xdr, 4); | ||
53 | if (!p) | ||
54 | return -EIO; | ||
55 | b->simple.nr_sigs = be32_to_cpup(p++); | ||
56 | if (!b->simple.nr_sigs) { | ||
57 | dprintk("no signature\n"); | ||
58 | return -EIO; | ||
59 | } | ||
60 | |||
61 | b->simple.len = 4 + 4; | ||
62 | for (i = 0; i < b->simple.nr_sigs; i++) { | ||
63 | p = xdr_inline_decode(xdr, 8 + 4); | ||
64 | if (!p) | ||
65 | return -EIO; | ||
66 | p = xdr_decode_hyper(p, &b->simple.sigs[i].offset); | ||
67 | b->simple.sigs[i].sig_len = be32_to_cpup(p++); | ||
68 | |||
69 | p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len); | ||
70 | if (!p) | ||
71 | return -EIO; | ||
72 | memcpy(&b->simple.sigs[i].sig, p, | ||
73 | b->simple.sigs[i].sig_len); | ||
74 | |||
75 | b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len; | ||
76 | } | ||
77 | break; | ||
78 | case PNFS_BLOCK_VOLUME_SLICE: | ||
79 | p = xdr_inline_decode(xdr, 8 + 8 + 4); | ||
80 | if (!p) | ||
81 | return -EIO; | ||
82 | p = xdr_decode_hyper(p, &b->slice.start); | ||
83 | p = xdr_decode_hyper(p, &b->slice.len); | ||
84 | b->slice.volume = be32_to_cpup(p++); | ||
85 | break; | ||
86 | case PNFS_BLOCK_VOLUME_CONCAT: | ||
87 | p = xdr_inline_decode(xdr, 4); | ||
88 | if (!p) | ||
89 | return -EIO; | ||
90 | b->concat.volumes_count = be32_to_cpup(p++); | ||
91 | |||
92 | p = xdr_inline_decode(xdr, b->concat.volumes_count * 4); | ||
93 | if (!p) | ||
94 | return -EIO; | ||
95 | for (i = 0; i < b->concat.volumes_count; i++) | ||
96 | b->concat.volumes[i] = be32_to_cpup(p++); | ||
97 | break; | ||
98 | case PNFS_BLOCK_VOLUME_STRIPE: | ||
99 | p = xdr_inline_decode(xdr, 8 + 4); | ||
100 | if (!p) | ||
101 | return -EIO; | ||
102 | p = xdr_decode_hyper(p, &b->stripe.chunk_size); | ||
103 | b->stripe.volumes_count = be32_to_cpup(p++); | ||
104 | |||
105 | p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4); | ||
106 | if (!p) | ||
107 | return -EIO; | ||
108 | for (i = 0; i < b->stripe.volumes_count; i++) | ||
109 | b->stripe.volumes[i] = be32_to_cpup(p++); | ||
110 | break; | ||
111 | default: | ||
112 | dprintk("unknown volume type!\n"); | ||
113 | return -EIO; | ||
114 | } | ||
115 | |||
116 | return 0; | ||
117 | } | ||
118 | |||
119 | static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset, | ||
120 | struct pnfs_block_dev_map *map) | ||
121 | { | ||
122 | map->start = dev->start; | ||
123 | map->len = dev->len; | ||
124 | map->disk_offset = dev->disk_offset; | ||
125 | map->bdev = dev->bdev; | ||
126 | return true; | ||
127 | } | ||
128 | |||
129 | static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset, | ||
130 | struct pnfs_block_dev_map *map) | ||
131 | { | ||
132 | int i; | ||
133 | |||
134 | for (i = 0; i < dev->nr_children; i++) { | ||
135 | struct pnfs_block_dev *child = &dev->children[i]; | ||
136 | |||
137 | if (child->start > offset || | ||
138 | child->start + child->len <= offset) | ||
139 | continue; | ||
140 | |||
141 | child->map(child, offset - child->start, map); | ||
142 | return true; | ||
143 | } | ||
144 | |||
145 | dprintk("%s: ran off loop!\n", __func__); | ||
146 | return false; | ||
147 | } | ||
148 | |||
149 | static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset, | ||
150 | struct pnfs_block_dev_map *map) | ||
151 | { | ||
152 | struct pnfs_block_dev *child; | ||
153 | u64 chunk = (offset / dev->chunk_size); | ||
154 | int chunk_idx = chunk % dev->nr_children; | ||
155 | u64 disk_offset; | ||
156 | |||
157 | if (chunk_idx > dev->nr_children) { | ||
158 | dprintk("%s: invalid chunk idx %d (%lld/%lld)\n", | ||
159 | __func__, chunk_idx, offset, dev->chunk_size); | ||
160 | /* error, should not happen */ | ||
161 | return false; | ||
162 | } | ||
163 | |||
164 | /* truncate offset to the beginning of the stripe */ | ||
165 | offset = chunk * dev->chunk_size; | ||
166 | |||
167 | /* disk offset of the stripe */ | ||
168 | disk_offset = offset / dev->nr_children; | ||
169 | |||
170 | child = &dev->children[chunk_idx]; | ||
171 | child->map(child, disk_offset, map); | ||
172 | |||
173 | map->start += offset; | ||
174 | map->disk_offset += disk_offset; | ||
175 | map->len = dev->chunk_size; | ||
176 | return true; | ||
177 | } | ||
178 | |||
179 | static int | ||
180 | bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, | ||
181 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask); | ||
182 | |||
183 | |||
184 | static int | ||
185 | bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, | ||
186 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) | ||
187 | { | ||
188 | struct pnfs_block_volume *v = &volumes[idx]; | ||
189 | dev_t dev; | ||
190 | |||
191 | dev = bl_resolve_deviceid(server, v, gfp_mask); | ||
192 | if (!dev) | ||
193 | return -EIO; | ||
194 | |||
195 | d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); | ||
196 | if (IS_ERR(d->bdev)) { | ||
197 | printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", | ||
198 | MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); | ||
199 | return PTR_ERR(d->bdev); | ||
200 | } | ||
201 | |||
202 | |||
203 | d->len = i_size_read(d->bdev->bd_inode); | ||
204 | d->map = bl_map_simple; | ||
205 | |||
206 | printk(KERN_INFO "pNFS: using block device %s\n", | ||
207 | d->bdev->bd_disk->disk_name); | ||
208 | return 0; | ||
209 | } | ||
210 | |||
211 | static int | ||
212 | bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d, | ||
213 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) | ||
214 | { | ||
215 | struct pnfs_block_volume *v = &volumes[idx]; | ||
216 | int ret; | ||
217 | |||
218 | ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask); | ||
219 | if (ret) | ||
220 | return ret; | ||
221 | |||
222 | d->disk_offset = v->slice.start; | ||
223 | d->len = v->slice.len; | ||
224 | return 0; | ||
225 | } | ||
226 | |||
227 | static int | ||
228 | bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d, | ||
229 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) | ||
230 | { | ||
231 | struct pnfs_block_volume *v = &volumes[idx]; | ||
232 | u64 len = 0; | ||
233 | int ret, i; | ||
234 | |||
235 | d->children = kcalloc(v->concat.volumes_count, | ||
236 | sizeof(struct pnfs_block_dev), GFP_KERNEL); | ||
237 | if (!d->children) | ||
238 | return -ENOMEM; | ||
239 | |||
240 | for (i = 0; i < v->concat.volumes_count; i++) { | ||
241 | ret = bl_parse_deviceid(server, &d->children[i], | ||
242 | volumes, v->concat.volumes[i], gfp_mask); | ||
243 | if (ret) | ||
244 | return ret; | ||
245 | |||
246 | d->nr_children++; | ||
247 | d->children[i].start += len; | ||
248 | len += d->children[i].len; | ||
249 | } | ||
250 | |||
251 | d->len = len; | ||
252 | d->map = bl_map_concat; | ||
253 | return 0; | ||
254 | } | ||
255 | |||
256 | static int | ||
257 | bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d, | ||
258 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) | ||
259 | { | ||
260 | struct pnfs_block_volume *v = &volumes[idx]; | ||
261 | u64 len = 0; | ||
262 | int ret, i; | ||
263 | |||
264 | d->children = kcalloc(v->stripe.volumes_count, | ||
265 | sizeof(struct pnfs_block_dev), GFP_KERNEL); | ||
266 | if (!d->children) | ||
267 | return -ENOMEM; | ||
268 | |||
269 | for (i = 0; i < v->stripe.volumes_count; i++) { | ||
270 | ret = bl_parse_deviceid(server, &d->children[i], | ||
271 | volumes, v->stripe.volumes[i], gfp_mask); | ||
272 | if (ret) | ||
273 | return ret; | ||
274 | |||
275 | d->nr_children++; | ||
276 | len += d->children[i].len; | ||
277 | } | ||
278 | |||
279 | d->len = len; | ||
280 | d->chunk_size = v->stripe.chunk_size; | ||
281 | d->map = bl_map_stripe; | ||
282 | return 0; | ||
283 | } | ||
284 | |||
285 | static int | ||
286 | bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, | ||
287 | struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) | ||
288 | { | ||
289 | switch (volumes[idx].type) { | ||
290 | case PNFS_BLOCK_VOLUME_SIMPLE: | ||
291 | return bl_parse_simple(server, d, volumes, idx, gfp_mask); | ||
292 | case PNFS_BLOCK_VOLUME_SLICE: | ||
293 | return bl_parse_slice(server, d, volumes, idx, gfp_mask); | ||
294 | case PNFS_BLOCK_VOLUME_CONCAT: | ||
295 | return bl_parse_concat(server, d, volumes, idx, gfp_mask); | ||
296 | case PNFS_BLOCK_VOLUME_STRIPE: | ||
297 | return bl_parse_stripe(server, d, volumes, idx, gfp_mask); | ||
298 | default: | ||
299 | dprintk("unsupported volume type: %d\n", volumes[idx].type); | ||
300 | return -EIO; | ||
301 | } | ||
302 | } | ||
303 | |||
304 | struct nfs4_deviceid_node * | ||
305 | bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, | ||
306 | gfp_t gfp_mask) | ||
307 | { | ||
308 | struct nfs4_deviceid_node *node = NULL; | ||
309 | struct pnfs_block_volume *volumes; | ||
310 | struct pnfs_block_dev *top; | ||
311 | struct xdr_stream xdr; | ||
312 | struct xdr_buf buf; | ||
313 | struct page *scratch; | ||
314 | int nr_volumes, ret, i; | ||
315 | __be32 *p; | ||
316 | |||
317 | scratch = alloc_page(gfp_mask); | ||
318 | if (!scratch) | ||
319 | goto out; | ||
320 | |||
321 | xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen); | ||
322 | xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); | ||
323 | |||
324 | p = xdr_inline_decode(&xdr, sizeof(__be32)); | ||
325 | if (!p) | ||
326 | goto out_free_scratch; | ||
327 | nr_volumes = be32_to_cpup(p++); | ||
328 | |||
329 | volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume), | ||
330 | gfp_mask); | ||
331 | if (!volumes) | ||
332 | goto out_free_scratch; | ||
333 | |||
334 | for (i = 0; i < nr_volumes; i++) { | ||
335 | ret = nfs4_block_decode_volume(&xdr, &volumes[i]); | ||
336 | if (ret < 0) | ||
337 | goto out_free_volumes; | ||
338 | } | ||
339 | |||
340 | top = kzalloc(sizeof(*top), gfp_mask); | ||
341 | if (!top) | ||
342 | goto out_free_volumes; | ||
343 | |||
344 | ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask); | ||
345 | if (ret) { | ||
346 | bl_free_device(top); | ||
347 | kfree(top); | ||
348 | goto out_free_volumes; | ||
349 | } | ||
350 | |||
351 | node = &top->node; | ||
352 | nfs4_init_deviceid_node(node, server, &pdev->dev_id); | ||
353 | |||
354 | out_free_volumes: | ||
355 | kfree(volumes); | ||
356 | out_free_scratch: | ||
357 | __free_page(scratch); | ||
358 | out: | ||
359 | return node; | ||
360 | } | ||
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index bfb04861eb61..8d04bda2bd2e 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c | |||
@@ -34,94 +34,53 @@ | |||
34 | 34 | ||
35 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD | 35 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD |
36 | 36 | ||
37 | static void bl_dm_remove(struct net *net, dev_t dev) | 37 | static void |
38 | nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b) | ||
38 | { | 39 | { |
39 | struct bl_pipe_msg bl_pipe_msg; | 40 | int i; |
40 | struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; | 41 | |
41 | struct bl_dev_msg bl_umount_request; | 42 | *p++ = cpu_to_be32(1); |
42 | struct bl_msg_hdr bl_msg = { | 43 | *p++ = cpu_to_be32(b->type); |
43 | .type = BL_DEVICE_UMOUNT, | 44 | *p++ = cpu_to_be32(b->simple.nr_sigs); |
44 | .totallen = sizeof(bl_umount_request), | 45 | for (i = 0; i < b->simple.nr_sigs; i++) { |
45 | }; | 46 | p = xdr_encode_hyper(p, b->simple.sigs[i].offset); |
46 | uint8_t *dataptr; | 47 | p = xdr_encode_opaque(p, b->simple.sigs[i].sig, |
47 | DECLARE_WAITQUEUE(wq, current); | 48 | b->simple.sigs[i].sig_len); |
48 | struct nfs_net *nn = net_generic(net, nfs_net_id); | ||
49 | |||
50 | dprintk("Entering %s\n", __func__); | ||
51 | |||
52 | bl_pipe_msg.bl_wq = &nn->bl_wq; | ||
53 | memset(msg, 0, sizeof(*msg)); | ||
54 | msg->len = sizeof(bl_msg) + bl_msg.totallen; | ||
55 | msg->data = kzalloc(msg->len, GFP_NOFS); | ||
56 | if (!msg->data) | ||
57 | goto out; | ||
58 | |||
59 | memset(&bl_umount_request, 0, sizeof(bl_umount_request)); | ||
60 | bl_umount_request.major = MAJOR(dev); | ||
61 | bl_umount_request.minor = MINOR(dev); | ||
62 | |||
63 | memcpy(msg->data, &bl_msg, sizeof(bl_msg)); | ||
64 | dataptr = (uint8_t *) msg->data; | ||
65 | memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); | ||
66 | |||
67 | add_wait_queue(&nn->bl_wq, &wq); | ||
68 | if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) { | ||
69 | remove_wait_queue(&nn->bl_wq, &wq); | ||
70 | goto out; | ||
71 | } | 49 | } |
72 | |||
73 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
74 | schedule(); | ||
75 | __set_current_state(TASK_RUNNING); | ||
76 | remove_wait_queue(&nn->bl_wq, &wq); | ||
77 | |||
78 | out: | ||
79 | kfree(msg->data); | ||
80 | } | 50 | } |
81 | 51 | ||
82 | /* | 52 | dev_t |
83 | * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. | 53 | bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, |
84 | */ | ||
85 | struct nfs4_deviceid_node * | ||
86 | bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev, | ||
87 | gfp_t gfp_mask) | 54 | gfp_t gfp_mask) |
88 | { | 55 | { |
89 | struct pnfs_block_dev *rv; | ||
90 | struct block_device *bd; | ||
91 | struct bl_pipe_msg bl_pipe_msg; | ||
92 | struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; | ||
93 | struct bl_msg_hdr bl_msg = { | ||
94 | .type = BL_DEVICE_MOUNT, | ||
95 | .totallen = dev->mincount, | ||
96 | }; | ||
97 | uint8_t *dataptr; | ||
98 | DECLARE_WAITQUEUE(wq, current); | ||
99 | int offset, len, i, rc; | ||
100 | struct net *net = server->nfs_client->cl_net; | 56 | struct net *net = server->nfs_client->cl_net; |
101 | struct nfs_net *nn = net_generic(net, nfs_net_id); | 57 | struct nfs_net *nn = net_generic(net, nfs_net_id); |
102 | struct bl_dev_msg *reply = &nn->bl_mount_reply; | 58 | struct bl_dev_msg *reply = &nn->bl_mount_reply; |
59 | struct bl_pipe_msg bl_pipe_msg; | ||
60 | struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; | ||
61 | struct bl_msg_hdr *bl_msg; | ||
62 | DECLARE_WAITQUEUE(wq, current); | ||
63 | dev_t dev = 0; | ||
64 | int rc; | ||
103 | 65 | ||
104 | dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); | 66 | dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); |
105 | dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, | ||
106 | dev->mincount); | ||
107 | 67 | ||
108 | bl_pipe_msg.bl_wq = &nn->bl_wq; | 68 | bl_pipe_msg.bl_wq = &nn->bl_wq; |
69 | |||
70 | b->simple.len += 4; /* single volume */ | ||
71 | if (b->simple.len > PAGE_SIZE) | ||
72 | return -EIO; | ||
73 | |||
109 | memset(msg, 0, sizeof(*msg)); | 74 | memset(msg, 0, sizeof(*msg)); |
110 | msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, gfp_mask); | 75 | msg->len = sizeof(*bl_msg) + b->simple.len; |
76 | msg->data = kzalloc(msg->len, gfp_mask); | ||
111 | if (!msg->data) | 77 | if (!msg->data) |
112 | goto out; | 78 | goto out; |
113 | 79 | ||
114 | memcpy(msg->data, &bl_msg, sizeof(bl_msg)); | 80 | bl_msg = msg->data; |
115 | dataptr = (uint8_t *) msg->data; | 81 | bl_msg->type = BL_DEVICE_MOUNT, |
116 | len = dev->mincount; | 82 | bl_msg->totallen = b->simple.len; |
117 | offset = sizeof(bl_msg); | 83 | nfs4_encode_simple(msg->data + sizeof(*bl_msg), b); |
118 | for (i = 0; len > 0; i++) { | ||
119 | memcpy(&dataptr[offset], page_address(dev->pages[i]), | ||
120 | len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE); | ||
121 | len -= PAGE_CACHE_SIZE; | ||
122 | offset += PAGE_CACHE_SIZE; | ||
123 | } | ||
124 | msg->len = sizeof(bl_msg) + dev->mincount; | ||
125 | 84 | ||
126 | dprintk("%s CALLING USERSPACE DAEMON\n", __func__); | 85 | dprintk("%s CALLING USERSPACE DAEMON\n", __func__); |
127 | add_wait_queue(&nn->bl_wq, &wq); | 86 | add_wait_queue(&nn->bl_wq, &wq); |
@@ -142,46 +101,10 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev, | |||
142 | goto out; | 101 | goto out; |
143 | } | 102 | } |
144 | 103 | ||
145 | bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor), | 104 | dev = MKDEV(reply->major, reply->minor); |
146 | FMODE_READ, NULL); | ||
147 | if (IS_ERR(bd)) { | ||
148 | printk(KERN_WARNING "%s failed to open device %d:%d (%ld)\n", | ||
149 | __func__, reply->major, reply->minor, | ||
150 | PTR_ERR(bd)); | ||
151 | goto out; | ||
152 | } | ||
153 | |||
154 | rv = kzalloc(sizeof(*rv), gfp_mask); | ||
155 | if (!rv) | ||
156 | goto out; | ||
157 | |||
158 | nfs4_init_deviceid_node(&rv->d_node, server, &dev->dev_id); | ||
159 | rv->d_bdev = bd; | ||
160 | |||
161 | dprintk("%s Created device %s with bd_block_size %u\n", | ||
162 | __func__, | ||
163 | bd->bd_disk->disk_name, | ||
164 | bd->bd_block_size); | ||
165 | |||
166 | kfree(msg->data); | ||
167 | return &rv->d_node; | ||
168 | |||
169 | out: | 105 | out: |
170 | kfree(msg->data); | 106 | kfree(msg->data); |
171 | return NULL; | 107 | return dev; |
172 | } | ||
173 | |||
174 | void | ||
175 | bl_free_deviceid_node(struct nfs4_deviceid_node *d) | ||
176 | { | ||
177 | struct pnfs_block_dev *dev = | ||
178 | container_of(d, struct pnfs_block_dev, d_node); | ||
179 | struct net *net = d->nfs_client->cl_net; | ||
180 | |||
181 | blkdev_put(dev->d_bdev, FMODE_READ); | ||
182 | bl_dm_remove(net, dev->d_bdev->bd_dev); | ||
183 | |||
184 | kfree(dev); | ||
185 | } | 108 | } |
186 | 109 | ||
187 | static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, | 110 | static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, |