diff options
Diffstat (limited to 'fs/nfs/blocklayout/blocklayoutdev.c')
-rw-r--r-- | fs/nfs/blocklayout/blocklayoutdev.c | 391 |
1 files changed, 391 insertions, 0 deletions
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c new file mode 100644 index 00000000000..d08ba9107fd --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdev.c | |||
@@ -0,0 +1,391 @@ | |||
1 | /* | ||
2 | * linux/fs/nfs/blocklayout/blocklayoutdev.c | ||
3 | * | ||
4 | * Device operations for the pnfs nfs4 file layout driver. | ||
5 | * | ||
6 | * Copyright (c) 2006 The Regents of the University of Michigan. | ||
7 | * All rights reserved. | ||
8 | * | ||
9 | * Andy Adamson <andros@citi.umich.edu> | ||
10 | * Fred Isaman <iisaman@umich.edu> | ||
11 | * | ||
12 | * permission is granted to use, copy, create derivative works and | ||
13 | * redistribute this software and such derivative works for any purpose, | ||
14 | * so long as the name of the university of michigan is not used in | ||
15 | * any advertising or publicity pertaining to the use or distribution | ||
16 | * of this software without specific, written prior authorization. if | ||
17 | * the above copyright notice or any other identification of the | ||
18 | * university of michigan is included in any copy of any portion of | ||
19 | * this software, then the disclaimer below must also be included. | ||
20 | * | ||
21 | * this software is provided as is, without representation from the | ||
22 | * university of michigan as to its fitness for any purpose, and without | ||
23 | * warranty by the university of michigan of any kind, either express | ||
24 | * or implied, including without limitation the implied warranties of | ||
25 | * merchantability and fitness for a particular purpose. the regents | ||
26 | * of the university of michigan shall not be liable for any damages, | ||
27 | * including special, indirect, incidental, or consequential damages, | ||
28 | * with respect to any claim arising out or in connection with the use | ||
29 | * of the software, even if it has been or is hereafter advised of the | ||
30 | * possibility of such damages. | ||
31 | */ | ||
32 | #include <linux/module.h> | ||
33 | #include <linux/buffer_head.h> /* __bread */ | ||
34 | |||
35 | #include <linux/genhd.h> | ||
36 | #include <linux/blkdev.h> | ||
37 | #include <linux/hash.h> | ||
38 | |||
39 | #include "blocklayout.h" | ||
40 | |||
41 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD | ||
42 | |||
43 | static int decode_sector_number(__be32 **rp, sector_t *sp) | ||
44 | { | ||
45 | uint64_t s; | ||
46 | |||
47 | *rp = xdr_decode_hyper(*rp, &s); | ||
48 | if (s & 0x1ff) { | ||
49 | printk(KERN_WARNING "%s: sector not aligned\n", __func__); | ||
50 | return -1; | ||
51 | } | ||
52 | *sp = s >> SECTOR_SHIFT; | ||
53 | return 0; | ||
54 | } | ||
55 | |||
56 | /* Open a block_device by device number. */ | ||
57 | struct block_device *nfs4_blkdev_get(dev_t dev) | ||
58 | { | ||
59 | struct block_device *bd; | ||
60 | |||
61 | dprintk("%s enter\n", __func__); | ||
62 | bd = blkdev_get_by_dev(dev, FMODE_READ, NULL); | ||
63 | if (IS_ERR(bd)) | ||
64 | goto fail; | ||
65 | return bd; | ||
66 | fail: | ||
67 | dprintk("%s failed to open device : %ld\n", | ||
68 | __func__, PTR_ERR(bd)); | ||
69 | return NULL; | ||
70 | } | ||
71 | |||
72 | /* | ||
73 | * Release the block device | ||
74 | */ | ||
75 | int nfs4_blkdev_put(struct block_device *bdev) | ||
76 | { | ||
77 | dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), | ||
78 | MINOR(bdev->bd_dev)); | ||
79 | return blkdev_put(bdev, FMODE_READ); | ||
80 | } | ||
81 | |||
82 | static struct bl_dev_msg bl_mount_reply; | ||
83 | |||
84 | ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, | ||
85 | size_t mlen) | ||
86 | { | ||
87 | if (mlen != sizeof (struct bl_dev_msg)) | ||
88 | return -EINVAL; | ||
89 | |||
90 | if (copy_from_user(&bl_mount_reply, src, mlen) != 0) | ||
91 | return -EFAULT; | ||
92 | |||
93 | wake_up(&bl_wq); | ||
94 | |||
95 | return mlen; | ||
96 | } | ||
97 | |||
98 | void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) | ||
99 | { | ||
100 | if (msg->errno >= 0) | ||
101 | return; | ||
102 | wake_up(&bl_wq); | ||
103 | } | ||
104 | |||
105 | /* | ||
106 | * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. | ||
107 | */ | ||
108 | struct pnfs_block_dev * | ||
109 | nfs4_blk_decode_device(struct nfs_server *server, | ||
110 | struct pnfs_device *dev) | ||
111 | { | ||
112 | struct pnfs_block_dev *rv; | ||
113 | struct block_device *bd = NULL; | ||
114 | struct rpc_pipe_msg msg; | ||
115 | struct bl_msg_hdr bl_msg = { | ||
116 | .type = BL_DEVICE_MOUNT, | ||
117 | .totallen = dev->mincount, | ||
118 | }; | ||
119 | uint8_t *dataptr; | ||
120 | DECLARE_WAITQUEUE(wq, current); | ||
121 | struct bl_dev_msg *reply = &bl_mount_reply; | ||
122 | int offset, len, i, rc; | ||
123 | |||
124 | dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); | ||
125 | dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, | ||
126 | dev->mincount); | ||
127 | |||
128 | memset(&msg, 0, sizeof(msg)); | ||
129 | msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); | ||
130 | if (!msg.data) { | ||
131 | rv = ERR_PTR(-ENOMEM); | ||
132 | goto out; | ||
133 | } | ||
134 | |||
135 | memcpy(msg.data, &bl_msg, sizeof(bl_msg)); | ||
136 | dataptr = (uint8_t *) msg.data; | ||
137 | len = dev->mincount; | ||
138 | offset = sizeof(bl_msg); | ||
139 | for (i = 0; len > 0; i++) { | ||
140 | memcpy(&dataptr[offset], page_address(dev->pages[i]), | ||
141 | len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE); | ||
142 | len -= PAGE_CACHE_SIZE; | ||
143 | offset += PAGE_CACHE_SIZE; | ||
144 | } | ||
145 | msg.len = sizeof(bl_msg) + dev->mincount; | ||
146 | |||
147 | dprintk("%s CALLING USERSPACE DAEMON\n", __func__); | ||
148 | add_wait_queue(&bl_wq, &wq); | ||
149 | rc = rpc_queue_upcall(bl_device_pipe->d_inode, &msg); | ||
150 | if (rc < 0) { | ||
151 | remove_wait_queue(&bl_wq, &wq); | ||
152 | rv = ERR_PTR(rc); | ||
153 | goto out; | ||
154 | } | ||
155 | |||
156 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
157 | schedule(); | ||
158 | __set_current_state(TASK_RUNNING); | ||
159 | remove_wait_queue(&bl_wq, &wq); | ||
160 | |||
161 | if (reply->status != BL_DEVICE_REQUEST_PROC) { | ||
162 | dprintk("%s failed to open device: %d\n", | ||
163 | __func__, reply->status); | ||
164 | rv = ERR_PTR(-EINVAL); | ||
165 | goto out; | ||
166 | } | ||
167 | |||
168 | bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); | ||
169 | if (IS_ERR(bd)) { | ||
170 | rc = PTR_ERR(bd); | ||
171 | dprintk("%s failed to open device : %d\n", __func__, rc); | ||
172 | rv = ERR_PTR(rc); | ||
173 | goto out; | ||
174 | } | ||
175 | |||
176 | rv = kzalloc(sizeof(*rv), GFP_NOFS); | ||
177 | if (!rv) { | ||
178 | rv = ERR_PTR(-ENOMEM); | ||
179 | goto out; | ||
180 | } | ||
181 | |||
182 | rv->bm_mdev = bd; | ||
183 | memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); | ||
184 | dprintk("%s Created device %s with bd_block_size %u\n", | ||
185 | __func__, | ||
186 | bd->bd_disk->disk_name, | ||
187 | bd->bd_block_size); | ||
188 | |||
189 | out: | ||
190 | kfree(msg.data); | ||
191 | return rv; | ||
192 | } | ||
193 | |||
194 | /* Map deviceid returned by the server to constructed block_device */ | ||
195 | static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, | ||
196 | struct nfs4_deviceid *id) | ||
197 | { | ||
198 | struct block_device *rv = NULL; | ||
199 | struct block_mount_id *mid; | ||
200 | struct pnfs_block_dev *dev; | ||
201 | |||
202 | dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); | ||
203 | mid = BLK_ID(lo); | ||
204 | spin_lock(&mid->bm_lock); | ||
205 | list_for_each_entry(dev, &mid->bm_devlist, bm_node) { | ||
206 | if (memcmp(id->data, dev->bm_mdevid.data, | ||
207 | NFS4_DEVICEID4_SIZE) == 0) { | ||
208 | rv = dev->bm_mdev; | ||
209 | goto out; | ||
210 | } | ||
211 | } | ||
212 | out: | ||
213 | spin_unlock(&mid->bm_lock); | ||
214 | dprintk("%s returning %p\n", __func__, rv); | ||
215 | return rv; | ||
216 | } | ||
217 | |||
218 | /* Tracks info needed to ensure extents in layout obey constraints of spec */ | ||
219 | struct layout_verification { | ||
220 | u32 mode; /* R or RW */ | ||
221 | u64 start; /* Expected start of next non-COW extent */ | ||
222 | u64 inval; /* Start of INVAL coverage */ | ||
223 | u64 cowread; /* End of COW read coverage */ | ||
224 | }; | ||
225 | |||
226 | /* Verify the extent meets the layout requirements of the pnfs-block draft, | ||
227 | * section 2.3.1. | ||
228 | */ | ||
229 | static int verify_extent(struct pnfs_block_extent *be, | ||
230 | struct layout_verification *lv) | ||
231 | { | ||
232 | if (lv->mode == IOMODE_READ) { | ||
233 | if (be->be_state == PNFS_BLOCK_READWRITE_DATA || | ||
234 | be->be_state == PNFS_BLOCK_INVALID_DATA) | ||
235 | return -EIO; | ||
236 | if (be->be_f_offset != lv->start) | ||
237 | return -EIO; | ||
238 | lv->start += be->be_length; | ||
239 | return 0; | ||
240 | } | ||
241 | /* lv->mode == IOMODE_RW */ | ||
242 | if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { | ||
243 | if (be->be_f_offset != lv->start) | ||
244 | return -EIO; | ||
245 | if (lv->cowread > lv->start) | ||
246 | return -EIO; | ||
247 | lv->start += be->be_length; | ||
248 | lv->inval = lv->start; | ||
249 | return 0; | ||
250 | } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | ||
251 | if (be->be_f_offset != lv->start) | ||
252 | return -EIO; | ||
253 | lv->start += be->be_length; | ||
254 | return 0; | ||
255 | } else if (be->be_state == PNFS_BLOCK_READ_DATA) { | ||
256 | if (be->be_f_offset > lv->start) | ||
257 | return -EIO; | ||
258 | if (be->be_f_offset < lv->inval) | ||
259 | return -EIO; | ||
260 | if (be->be_f_offset < lv->cowread) | ||
261 | return -EIO; | ||
262 | /* It looks like you might want to min this with lv->start, | ||
263 | * but you really don't. | ||
264 | */ | ||
265 | lv->inval = lv->inval + be->be_length; | ||
266 | lv->cowread = be->be_f_offset + be->be_length; | ||
267 | return 0; | ||
268 | } else | ||
269 | return -EIO; | ||
270 | } | ||
271 | |||
272 | /* XDR decode pnfs_block_layout4 structure */ | ||
273 | int | ||
274 | nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, | ||
275 | struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) | ||
276 | { | ||
277 | struct pnfs_block_layout *bl = BLK_LO2EXT(lo); | ||
278 | int i, status = -EIO; | ||
279 | uint32_t count; | ||
280 | struct pnfs_block_extent *be = NULL, *save; | ||
281 | struct xdr_stream stream; | ||
282 | struct xdr_buf buf; | ||
283 | struct page *scratch; | ||
284 | __be32 *p; | ||
285 | struct layout_verification lv = { | ||
286 | .mode = lgr->range.iomode, | ||
287 | .start = lgr->range.offset >> SECTOR_SHIFT, | ||
288 | .inval = lgr->range.offset >> SECTOR_SHIFT, | ||
289 | .cowread = lgr->range.offset >> SECTOR_SHIFT, | ||
290 | }; | ||
291 | LIST_HEAD(extents); | ||
292 | |||
293 | dprintk("---> %s\n", __func__); | ||
294 | |||
295 | scratch = alloc_page(gfp_flags); | ||
296 | if (!scratch) | ||
297 | return -ENOMEM; | ||
298 | |||
299 | xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); | ||
300 | xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); | ||
301 | |||
302 | p = xdr_inline_decode(&stream, 4); | ||
303 | if (unlikely(!p)) | ||
304 | goto out_err; | ||
305 | |||
306 | count = be32_to_cpup(p++); | ||
307 | |||
308 | dprintk("%s enter, number of extents %i\n", __func__, count); | ||
309 | p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); | ||
310 | if (unlikely(!p)) | ||
311 | goto out_err; | ||
312 | |||
313 | /* Decode individual extents, putting them in temporary | ||
314 | * staging area until whole layout is decoded to make error | ||
315 | * recovery easier. | ||
316 | */ | ||
317 | for (i = 0; i < count; i++) { | ||
318 | be = bl_alloc_extent(); | ||
319 | if (!be) { | ||
320 | status = -ENOMEM; | ||
321 | goto out_err; | ||
322 | } | ||
323 | memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); | ||
324 | p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); | ||
325 | be->be_mdev = translate_devid(lo, &be->be_devid); | ||
326 | if (!be->be_mdev) | ||
327 | goto out_err; | ||
328 | |||
329 | /* The next three values are read in as bytes, | ||
330 | * but stored as 512-byte sector lengths | ||
331 | */ | ||
332 | if (decode_sector_number(&p, &be->be_f_offset) < 0) | ||
333 | goto out_err; | ||
334 | if (decode_sector_number(&p, &be->be_length) < 0) | ||
335 | goto out_err; | ||
336 | if (decode_sector_number(&p, &be->be_v_offset) < 0) | ||
337 | goto out_err; | ||
338 | be->be_state = be32_to_cpup(p++); | ||
339 | if (be->be_state == PNFS_BLOCK_INVALID_DATA) | ||
340 | be->be_inval = &bl->bl_inval; | ||
341 | if (verify_extent(be, &lv)) { | ||
342 | dprintk("%s verify failed\n", __func__); | ||
343 | goto out_err; | ||
344 | } | ||
345 | list_add_tail(&be->be_node, &extents); | ||
346 | } | ||
347 | if (lgr->range.offset + lgr->range.length != | ||
348 | lv.start << SECTOR_SHIFT) { | ||
349 | dprintk("%s Final length mismatch\n", __func__); | ||
350 | be = NULL; | ||
351 | goto out_err; | ||
352 | } | ||
353 | if (lv.start < lv.cowread) { | ||
354 | dprintk("%s Final uncovered COW extent\n", __func__); | ||
355 | be = NULL; | ||
356 | goto out_err; | ||
357 | } | ||
358 | /* Extents decoded properly, now try to merge them in to | ||
359 | * existing layout extents. | ||
360 | */ | ||
361 | spin_lock(&bl->bl_ext_lock); | ||
362 | list_for_each_entry_safe(be, save, &extents, be_node) { | ||
363 | list_del(&be->be_node); | ||
364 | status = bl_add_merge_extent(bl, be); | ||
365 | if (status) { | ||
366 | spin_unlock(&bl->bl_ext_lock); | ||
367 | /* This is a fairly catastrophic error, as the | ||
368 | * entire layout extent lists are now corrupted. | ||
369 | * We should have some way to distinguish this. | ||
370 | */ | ||
371 | be = NULL; | ||
372 | goto out_err; | ||
373 | } | ||
374 | } | ||
375 | spin_unlock(&bl->bl_ext_lock); | ||
376 | status = 0; | ||
377 | out: | ||
378 | __free_page(scratch); | ||
379 | dprintk("%s returns %i\n", __func__, status); | ||
380 | return status; | ||
381 | |||
382 | out_err: | ||
383 | bl_put_extent(be); | ||
384 | while (!list_empty(&extents)) { | ||
385 | be = list_first_entry(&extents, struct pnfs_block_extent, | ||
386 | be_node); | ||
387 | list_del(&be->be_node); | ||
388 | bl_put_extent(be); | ||
389 | } | ||
390 | goto out; | ||
391 | } | ||