diff options
Diffstat (limited to 'fs/nfs/blocklayout/blocklayoutdev.c')
-rw-r--r-- | fs/nfs/blocklayout/blocklayoutdev.c | 410 |
1 files changed, 410 insertions, 0 deletions
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c new file mode 100644 index 000000000000..a83b393fb01c --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdev.c | |||
@@ -0,0 +1,410 @@ | |||
1 | /* | ||
2 | * linux/fs/nfs/blocklayout/blocklayoutdev.c | ||
3 | * | ||
4 | * Device operations for the pnfs nfs4 block layout driver. | |
5 | * | ||
6 | * Copyright (c) 2006 The Regents of the University of Michigan. | ||
7 | * All rights reserved. | ||
8 | * | ||
9 | * Andy Adamson <andros@citi.umich.edu> | ||
10 | * Fred Isaman <iisaman@umich.edu> | ||
11 | * | ||
12 | * permission is granted to use, copy, create derivative works and | ||
13 | * redistribute this software and such derivative works for any purpose, | ||
14 | * so long as the name of the university of michigan is not used in | ||
15 | * any advertising or publicity pertaining to the use or distribution | ||
16 | * of this software without specific, written prior authorization. if | ||
17 | * the above copyright notice or any other identification of the | ||
18 | * university of michigan is included in any copy of any portion of | ||
19 | * this software, then the disclaimer below must also be included. | ||
20 | * | ||
21 | * this software is provided as is, without representation from the | ||
22 | * university of michigan as to its fitness for any purpose, and without | ||
23 | * warranty by the university of michigan of any kind, either express | ||
24 | * or implied, including without limitation the implied warranties of | ||
25 | * merchantability and fitness for a particular purpose. the regents | ||
26 | * of the university of michigan shall not be liable for any damages, | ||
27 | * including special, indirect, incidental, or consequential damages, | ||
28 | * with respect to any claim arising out or in connection with the use | ||
29 | * of the software, even if it has been or is hereafter advised of the | ||
30 | * possibility of such damages. | ||
31 | */ | ||
32 | #include <linux/module.h> | ||
33 | #include <linux/buffer_head.h> /* __bread */ | ||
34 | |||
35 | #include <linux/genhd.h> | ||
36 | #include <linux/blkdev.h> | ||
37 | #include <linux/hash.h> | ||
38 | |||
39 | #include "blocklayout.h" | ||
40 | |||
41 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD | ||
42 | |||
43 | static int decode_sector_number(__be32 **rp, sector_t *sp) | ||
44 | { | ||
45 | uint64_t s; | ||
46 | |||
47 | *rp = xdr_decode_hyper(*rp, &s); | ||
48 | if (s & 0x1ff) { | ||
49 | printk(KERN_WARNING "%s: sector not aligned\n", __func__); | ||
50 | return -1; | ||
51 | } | ||
52 | *sp = s >> SECTOR_SHIFT; | ||
53 | return 0; | ||
54 | } | ||
55 | |||
56 | /* Open a block_device by device number. */ | ||
57 | struct block_device *nfs4_blkdev_get(dev_t dev) | ||
58 | { | ||
59 | struct block_device *bd; | ||
60 | |||
61 | dprintk("%s enter\n", __func__); | ||
62 | bd = blkdev_get_by_dev(dev, FMODE_READ, NULL); | ||
63 | if (IS_ERR(bd)) | ||
64 | goto fail; | ||
65 | return bd; | ||
66 | fail: | ||
67 | dprintk("%s failed to open device : %ld\n", | ||
68 | __func__, PTR_ERR(bd)); | ||
69 | return NULL; | ||
70 | } | ||
71 | |||
72 | /* | ||
73 | * Release the block device | ||
74 | */ | ||
75 | int nfs4_blkdev_put(struct block_device *bdev) | ||
76 | { | ||
77 | dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), | ||
78 | MINOR(bdev->bd_dev)); | ||
79 | return blkdev_put(bdev, FMODE_READ); | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Shouldn't there be a rpc_generic_upcall() to do this for us? | ||
84 | */ | ||
85 | ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, | ||
86 | char __user *dst, size_t buflen) | ||
87 | { | ||
88 | char *data = (char *)msg->data + msg->copied; | ||
89 | size_t mlen = min(msg->len - msg->copied, buflen); | ||
90 | unsigned long left; | ||
91 | |||
92 | left = copy_to_user(dst, data, mlen); | ||
93 | if (left == mlen) { | ||
94 | msg->errno = -EFAULT; | ||
95 | return -EFAULT; | ||
96 | } | ||
97 | |||
98 | mlen -= left; | ||
99 | msg->copied += mlen; | ||
100 | msg->errno = 0; | ||
101 | return mlen; | ||
102 | } | ||
103 | |||
104 | static struct bl_dev_msg bl_mount_reply; | ||
105 | |||
106 | ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, | ||
107 | size_t mlen) | ||
108 | { | ||
109 | if (mlen != sizeof (struct bl_dev_msg)) | ||
110 | return -EINVAL; | ||
111 | |||
112 | if (copy_from_user(&bl_mount_reply, src, mlen) != 0) | ||
113 | return -EFAULT; | ||
114 | |||
115 | wake_up(&bl_wq); | ||
116 | |||
117 | return mlen; | ||
118 | } | ||
119 | |||
120 | void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) | ||
121 | { | ||
122 | if (msg->errno >= 0) | ||
123 | return; | ||
124 | wake_up(&bl_wq); | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. | ||
129 | */ | ||
130 | struct pnfs_block_dev * | ||
131 | nfs4_blk_decode_device(struct nfs_server *server, | ||
132 | struct pnfs_device *dev) | ||
133 | { | ||
134 | struct pnfs_block_dev *rv = NULL; | ||
135 | struct block_device *bd = NULL; | ||
136 | struct rpc_pipe_msg msg; | ||
137 | struct bl_msg_hdr bl_msg = { | ||
138 | .type = BL_DEVICE_MOUNT, | ||
139 | .totallen = dev->mincount, | ||
140 | }; | ||
141 | uint8_t *dataptr; | ||
142 | DECLARE_WAITQUEUE(wq, current); | ||
143 | struct bl_dev_msg *reply = &bl_mount_reply; | ||
144 | int offset, len, i; | ||
145 | |||
146 | dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); | ||
147 | dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, | ||
148 | dev->mincount); | ||
149 | |||
150 | memset(&msg, 0, sizeof(msg)); | ||
151 | msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); | ||
152 | if (!msg.data) { | ||
153 | rv = ERR_PTR(-ENOMEM); | ||
154 | goto out; | ||
155 | } | ||
156 | |||
157 | memcpy(msg.data, &bl_msg, sizeof(bl_msg)); | ||
158 | dataptr = (uint8_t *) msg.data; | ||
159 | len = dev->mincount; | ||
160 | offset = sizeof(bl_msg); | ||
161 | for (i = 0; len > 0; i++) { | ||
162 | memcpy(&dataptr[offset], page_address(dev->pages[i]), | ||
163 | len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE); | ||
164 | len -= PAGE_CACHE_SIZE; | ||
165 | offset += PAGE_CACHE_SIZE; | ||
166 | } | ||
167 | msg.len = sizeof(bl_msg) + dev->mincount; | ||
168 | |||
169 | dprintk("%s CALLING USERSPACE DAEMON\n", __func__); | ||
170 | add_wait_queue(&bl_wq, &wq); | ||
171 | if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { | ||
172 | remove_wait_queue(&bl_wq, &wq); | ||
173 | goto out; | ||
174 | } | ||
175 | |||
176 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
177 | schedule(); | ||
178 | __set_current_state(TASK_RUNNING); | ||
179 | remove_wait_queue(&bl_wq, &wq); | ||
180 | |||
181 | if (reply->status != BL_DEVICE_REQUEST_PROC) { | ||
182 | dprintk("%s failed to open device: %d\n", | ||
183 | __func__, reply->status); | ||
184 | rv = ERR_PTR(-EINVAL); | ||
185 | goto out; | ||
186 | } | ||
187 | |||
188 | bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); | ||
189 | if (IS_ERR(bd)) { | ||
190 | dprintk("%s failed to open device : %ld\n", | ||
191 | __func__, PTR_ERR(bd)); | ||
192 | goto out; | ||
193 | } | ||
194 | |||
195 | rv = kzalloc(sizeof(*rv), GFP_NOFS); | ||
196 | if (!rv) { | ||
197 | rv = ERR_PTR(-ENOMEM); | ||
198 | goto out; | ||
199 | } | ||
200 | |||
201 | rv->bm_mdev = bd; | ||
202 | memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); | ||
203 | dprintk("%s Created device %s with bd_block_size %u\n", | ||
204 | __func__, | ||
205 | bd->bd_disk->disk_name, | ||
206 | bd->bd_block_size); | ||
207 | |||
208 | out: | ||
209 | kfree(msg.data); | ||
210 | return rv; | ||
211 | } | ||
212 | |||
213 | /* Map deviceid returned by the server to constructed block_device */ | ||
214 | static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, | ||
215 | struct nfs4_deviceid *id) | ||
216 | { | ||
217 | struct block_device *rv = NULL; | ||
218 | struct block_mount_id *mid; | ||
219 | struct pnfs_block_dev *dev; | ||
220 | |||
221 | dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); | ||
222 | mid = BLK_ID(lo); | ||
223 | spin_lock(&mid->bm_lock); | ||
224 | list_for_each_entry(dev, &mid->bm_devlist, bm_node) { | ||
225 | if (memcmp(id->data, dev->bm_mdevid.data, | ||
226 | NFS4_DEVICEID4_SIZE) == 0) { | ||
227 | rv = dev->bm_mdev; | ||
228 | goto out; | ||
229 | } | ||
230 | } | ||
231 | out: | ||
232 | spin_unlock(&mid->bm_lock); | ||
233 | dprintk("%s returning %p\n", __func__, rv); | ||
234 | return rv; | ||
235 | } | ||
236 | |||
237 | /* Tracks info needed to ensure extents in layout obey constraints of spec */ | ||
238 | struct layout_verification { | ||
239 | u32 mode; /* R or RW */ | ||
240 | u64 start; /* Expected start of next non-COW extent */ | ||
241 | u64 inval; /* Start of INVAL coverage */ | ||
242 | u64 cowread; /* End of COW read coverage */ | ||
243 | }; | ||
244 | |||
245 | /* Verify the extent meets the layout requirements of the pnfs-block draft, | ||
246 | * section 2.3.1. | ||
247 | */ | ||
248 | static int verify_extent(struct pnfs_block_extent *be, | ||
249 | struct layout_verification *lv) | ||
250 | { | ||
251 | if (lv->mode == IOMODE_READ) { | ||
252 | if (be->be_state == PNFS_BLOCK_READWRITE_DATA || | ||
253 | be->be_state == PNFS_BLOCK_INVALID_DATA) | ||
254 | return -EIO; | ||
255 | if (be->be_f_offset != lv->start) | ||
256 | return -EIO; | ||
257 | lv->start += be->be_length; | ||
258 | return 0; | ||
259 | } | ||
260 | /* lv->mode == IOMODE_RW */ | ||
261 | if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { | ||
262 | if (be->be_f_offset != lv->start) | ||
263 | return -EIO; | ||
264 | if (lv->cowread > lv->start) | ||
265 | return -EIO; | ||
266 | lv->start += be->be_length; | ||
267 | lv->inval = lv->start; | ||
268 | return 0; | ||
269 | } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | ||
270 | if (be->be_f_offset != lv->start) | ||
271 | return -EIO; | ||
272 | lv->start += be->be_length; | ||
273 | return 0; | ||
274 | } else if (be->be_state == PNFS_BLOCK_READ_DATA) { | ||
275 | if (be->be_f_offset > lv->start) | ||
276 | return -EIO; | ||
277 | if (be->be_f_offset < lv->inval) | ||
278 | return -EIO; | ||
279 | if (be->be_f_offset < lv->cowread) | ||
280 | return -EIO; | ||
281 | /* It looks like you might want to min this with lv->start, | ||
282 | * but you really don't. | ||
283 | */ | ||
284 | lv->inval = lv->inval + be->be_length; | ||
285 | lv->cowread = be->be_f_offset + be->be_length; | ||
286 | return 0; | ||
287 | } else | ||
288 | return -EIO; | ||
289 | } | ||
290 | |||
291 | /* XDR decode pnfs_block_layout4 structure */ | ||
292 | int | ||
293 | nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, | ||
294 | struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) | ||
295 | { | ||
296 | struct pnfs_block_layout *bl = BLK_LO2EXT(lo); | ||
297 | int i, status = -EIO; | ||
298 | uint32_t count; | ||
299 | struct pnfs_block_extent *be = NULL, *save; | ||
300 | struct xdr_stream stream; | ||
301 | struct xdr_buf buf; | ||
302 | struct page *scratch; | ||
303 | __be32 *p; | ||
304 | struct layout_verification lv = { | ||
305 | .mode = lgr->range.iomode, | ||
306 | .start = lgr->range.offset >> SECTOR_SHIFT, | ||
307 | .inval = lgr->range.offset >> SECTOR_SHIFT, | ||
308 | .cowread = lgr->range.offset >> SECTOR_SHIFT, | ||
309 | }; | ||
310 | LIST_HEAD(extents); | ||
311 | |||
312 | dprintk("---> %s\n", __func__); | ||
313 | |||
314 | scratch = alloc_page(gfp_flags); | ||
315 | if (!scratch) | ||
316 | return -ENOMEM; | ||
317 | |||
318 | xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); | ||
319 | xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); | ||
320 | |||
321 | p = xdr_inline_decode(&stream, 4); | ||
322 | if (unlikely(!p)) | ||
323 | goto out_err; | ||
324 | |||
325 | count = be32_to_cpup(p++); | ||
326 | |||
327 | dprintk("%s enter, number of extents %i\n", __func__, count); | ||
328 | p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); | ||
329 | if (unlikely(!p)) | ||
330 | goto out_err; | ||
331 | |||
332 | /* Decode individual extents, putting them in temporary | ||
333 | * staging area until whole layout is decoded to make error | ||
334 | * recovery easier. | ||
335 | */ | ||
336 | for (i = 0; i < count; i++) { | ||
337 | be = bl_alloc_extent(); | ||
338 | if (!be) { | ||
339 | status = -ENOMEM; | ||
340 | goto out_err; | ||
341 | } | ||
342 | memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); | ||
343 | p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); | ||
344 | be->be_mdev = translate_devid(lo, &be->be_devid); | ||
345 | if (!be->be_mdev) | ||
346 | goto out_err; | ||
347 | |||
348 | /* The next three values are read in as bytes, | ||
349 | * but stored as 512-byte sector lengths | ||
350 | */ | ||
351 | if (decode_sector_number(&p, &be->be_f_offset) < 0) | ||
352 | goto out_err; | ||
353 | if (decode_sector_number(&p, &be->be_length) < 0) | ||
354 | goto out_err; | ||
355 | if (decode_sector_number(&p, &be->be_v_offset) < 0) | ||
356 | goto out_err; | ||
357 | be->be_state = be32_to_cpup(p++); | ||
358 | if (be->be_state == PNFS_BLOCK_INVALID_DATA) | ||
359 | be->be_inval = &bl->bl_inval; | ||
360 | if (verify_extent(be, &lv)) { | ||
361 | dprintk("%s verify failed\n", __func__); | ||
362 | goto out_err; | ||
363 | } | ||
364 | list_add_tail(&be->be_node, &extents); | ||
365 | } | ||
366 | if (lgr->range.offset + lgr->range.length != | ||
367 | lv.start << SECTOR_SHIFT) { | ||
368 | dprintk("%s Final length mismatch\n", __func__); | ||
369 | be = NULL; | ||
370 | goto out_err; | ||
371 | } | ||
372 | if (lv.start < lv.cowread) { | ||
373 | dprintk("%s Final uncovered COW extent\n", __func__); | ||
374 | be = NULL; | ||
375 | goto out_err; | ||
376 | } | ||
377 | /* Extents decoded properly, now try to merge them in to | ||
378 | * existing layout extents. | ||
379 | */ | ||
380 | spin_lock(&bl->bl_ext_lock); | ||
381 | list_for_each_entry_safe(be, save, &extents, be_node) { | ||
382 | list_del(&be->be_node); | ||
383 | status = bl_add_merge_extent(bl, be); | ||
384 | if (status) { | ||
385 | spin_unlock(&bl->bl_ext_lock); | ||
386 | /* This is a fairly catastrophic error, as the | ||
387 | * entire layout extent lists are now corrupted. | ||
388 | * We should have some way to distinguish this. | ||
389 | */ | ||
390 | be = NULL; | ||
391 | goto out_err; | ||
392 | } | ||
393 | } | ||
394 | spin_unlock(&bl->bl_ext_lock); | ||
395 | status = 0; | ||
396 | out: | ||
397 | __free_page(scratch); | ||
398 | dprintk("%s returns %i\n", __func__, status); | ||
399 | return status; | ||
400 | |||
401 | out_err: | ||
402 | bl_put_extent(be); | ||
403 | while (!list_empty(&extents)) { | ||
404 | be = list_first_entry(&extents, struct pnfs_block_extent, | ||
405 | be_node); | ||
406 | list_del(&be->be_node); | ||
407 | bl_put_extent(be); | ||
408 | } | ||
409 | goto out; | ||
410 | } | ||