diff options
Diffstat (limited to 'fs/nfs/blocklayout/blocklayoutdev.c')
| -rw-r--r-- | fs/nfs/blocklayout/blocklayoutdev.c | 410 |
1 files changed, 410 insertions, 0 deletions
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c new file mode 100644 index 000000000000..a83b393fb01c --- /dev/null +++ b/fs/nfs/blocklayout/blocklayoutdev.c | |||
| @@ -0,0 +1,410 @@ | |||
| 1 | /* | ||
| 2 | * linux/fs/nfs/blocklayout/blocklayoutdev.c | ||
| 3 | * | ||
| 4 | * Device operations for the pnfs nfs4 block layout driver. | ||
| 5 | * | ||
| 6 | * Copyright (c) 2006 The Regents of the University of Michigan. | ||
| 7 | * All rights reserved. | ||
| 8 | * | ||
| 9 | * Andy Adamson <andros@citi.umich.edu> | ||
| 10 | * Fred Isaman <iisaman@umich.edu> | ||
| 11 | * | ||
| 12 | * permission is granted to use, copy, create derivative works and | ||
| 13 | * redistribute this software and such derivative works for any purpose, | ||
| 14 | * so long as the name of the university of michigan is not used in | ||
| 15 | * any advertising or publicity pertaining to the use or distribution | ||
| 16 | * of this software without specific, written prior authorization. if | ||
| 17 | * the above copyright notice or any other identification of the | ||
| 18 | * university of michigan is included in any copy of any portion of | ||
| 19 | * this software, then the disclaimer below must also be included. | ||
| 20 | * | ||
| 21 | * this software is provided as is, without representation from the | ||
| 22 | * university of michigan as to its fitness for any purpose, and without | ||
| 23 | * warranty by the university of michigan of any kind, either express | ||
| 24 | * or implied, including without limitation the implied warranties of | ||
| 25 | * merchantability and fitness for a particular purpose. the regents | ||
| 26 | * of the university of michigan shall not be liable for any damages, | ||
| 27 | * including special, indirect, incidental, or consequential damages, | ||
| 28 | * with respect to any claim arising out or in connection with the use | ||
| 29 | * of the software, even if it has been or is hereafter advised of the | ||
| 30 | * possibility of such damages. | ||
| 31 | */ | ||
| 32 | #include <linux/module.h> | ||
| 33 | #include <linux/buffer_head.h> /* __bread */ | ||
| 34 | |||
| 35 | #include <linux/genhd.h> | ||
| 36 | #include <linux/blkdev.h> | ||
| 37 | #include <linux/hash.h> | ||
| 38 | |||
| 39 | #include "blocklayout.h" | ||
| 40 | |||
| 41 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD | ||
| 42 | |||
| 43 | static int decode_sector_number(__be32 **rp, sector_t *sp) | ||
| 44 | { | ||
| 45 | uint64_t s; | ||
| 46 | |||
| 47 | *rp = xdr_decode_hyper(*rp, &s); | ||
| 48 | if (s & 0x1ff) { | ||
| 49 | printk(KERN_WARNING "%s: sector not aligned\n", __func__); | ||
| 50 | return -1; | ||
| 51 | } | ||
| 52 | *sp = s >> SECTOR_SHIFT; | ||
| 53 | return 0; | ||
| 54 | } | ||
| 55 | |||
| 56 | /* Open a block_device by device number. */ | ||
| 57 | struct block_device *nfs4_blkdev_get(dev_t dev) | ||
| 58 | { | ||
| 59 | struct block_device *bd; | ||
| 60 | |||
| 61 | dprintk("%s enter\n", __func__); | ||
| 62 | bd = blkdev_get_by_dev(dev, FMODE_READ, NULL); | ||
| 63 | if (IS_ERR(bd)) | ||
| 64 | goto fail; | ||
| 65 | return bd; | ||
| 66 | fail: | ||
| 67 | dprintk("%s failed to open device : %ld\n", | ||
| 68 | __func__, PTR_ERR(bd)); | ||
| 69 | return NULL; | ||
| 70 | } | ||
| 71 | |||
| 72 | /* | ||
| 73 | * Release the block device | ||
| 74 | */ | ||
| 75 | int nfs4_blkdev_put(struct block_device *bdev) | ||
| 76 | { | ||
| 77 | dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), | ||
| 78 | MINOR(bdev->bd_dev)); | ||
| 79 | return blkdev_put(bdev, FMODE_READ); | ||
| 80 | } | ||
| 81 | |||
| 82 | /* | ||
| 83 | * Shouldn't there be a rpc_generic_upcall() to do this for us? | ||
| 84 | */ | ||
| 85 | ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, | ||
| 86 | char __user *dst, size_t buflen) | ||
| 87 | { | ||
| 88 | char *data = (char *)msg->data + msg->copied; | ||
| 89 | size_t mlen = min(msg->len - msg->copied, buflen); | ||
| 90 | unsigned long left; | ||
| 91 | |||
| 92 | left = copy_to_user(dst, data, mlen); | ||
| 93 | if (left == mlen) { | ||
| 94 | msg->errno = -EFAULT; | ||
| 95 | return -EFAULT; | ||
| 96 | } | ||
| 97 | |||
| 98 | mlen -= left; | ||
| 99 | msg->copied += mlen; | ||
| 100 | msg->errno = 0; | ||
| 101 | return mlen; | ||
| 102 | } | ||
| 103 | |||
| 104 | static struct bl_dev_msg bl_mount_reply; | ||
| 105 | |||
| 106 | ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, | ||
| 107 | size_t mlen) | ||
| 108 | { | ||
| 109 | if (mlen != sizeof (struct bl_dev_msg)) | ||
| 110 | return -EINVAL; | ||
| 111 | |||
| 112 | if (copy_from_user(&bl_mount_reply, src, mlen) != 0) | ||
| 113 | return -EFAULT; | ||
| 114 | |||
| 115 | wake_up(&bl_wq); | ||
| 116 | |||
| 117 | return mlen; | ||
| 118 | } | ||
| 119 | |||
| 120 | void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) | ||
| 121 | { | ||
| 122 | if (msg->errno >= 0) | ||
| 123 | return; | ||
| 124 | wake_up(&bl_wq); | ||
| 125 | } | ||
| 126 | |||
| 127 | /* | ||
| 128 | * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. | ||
| 129 | */ | ||
| 130 | struct pnfs_block_dev * | ||
| 131 | nfs4_blk_decode_device(struct nfs_server *server, | ||
| 132 | struct pnfs_device *dev) | ||
| 133 | { | ||
| 134 | struct pnfs_block_dev *rv = NULL; | ||
| 135 | struct block_device *bd = NULL; | ||
| 136 | struct rpc_pipe_msg msg; | ||
| 137 | struct bl_msg_hdr bl_msg = { | ||
| 138 | .type = BL_DEVICE_MOUNT, | ||
| 139 | .totallen = dev->mincount, | ||
| 140 | }; | ||
| 141 | uint8_t *dataptr; | ||
| 142 | DECLARE_WAITQUEUE(wq, current); | ||
| 143 | struct bl_dev_msg *reply = &bl_mount_reply; | ||
| 144 | int offset, len, i; | ||
| 145 | |||
| 146 | dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); | ||
| 147 | dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, | ||
| 148 | dev->mincount); | ||
| 149 | |||
| 150 | memset(&msg, 0, sizeof(msg)); | ||
| 151 | msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); | ||
| 152 | if (!msg.data) { | ||
| 153 | rv = ERR_PTR(-ENOMEM); | ||
| 154 | goto out; | ||
| 155 | } | ||
| 156 | |||
| 157 | memcpy(msg.data, &bl_msg, sizeof(bl_msg)); | ||
| 158 | dataptr = (uint8_t *) msg.data; | ||
| 159 | len = dev->mincount; | ||
| 160 | offset = sizeof(bl_msg); | ||
| 161 | for (i = 0; len > 0; i++) { | ||
| 162 | memcpy(&dataptr[offset], page_address(dev->pages[i]), | ||
| 163 | len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE); | ||
| 164 | len -= PAGE_CACHE_SIZE; | ||
| 165 | offset += PAGE_CACHE_SIZE; | ||
| 166 | } | ||
| 167 | msg.len = sizeof(bl_msg) + dev->mincount; | ||
| 168 | |||
| 169 | dprintk("%s CALLING USERSPACE DAEMON\n", __func__); | ||
| 170 | add_wait_queue(&bl_wq, &wq); | ||
| 171 | if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { | ||
| 172 | remove_wait_queue(&bl_wq, &wq); | ||
| 173 | goto out; | ||
| 174 | } | ||
| 175 | |||
| 176 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 177 | schedule(); | ||
| 178 | __set_current_state(TASK_RUNNING); | ||
| 179 | remove_wait_queue(&bl_wq, &wq); | ||
| 180 | |||
| 181 | if (reply->status != BL_DEVICE_REQUEST_PROC) { | ||
| 182 | dprintk("%s failed to open device: %d\n", | ||
| 183 | __func__, reply->status); | ||
| 184 | rv = ERR_PTR(-EINVAL); | ||
| 185 | goto out; | ||
| 186 | } | ||
| 187 | |||
| 188 | bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); | ||
| 189 | if (IS_ERR(bd)) { | ||
| 190 | dprintk("%s failed to open device : %ld\n", | ||
| 191 | __func__, PTR_ERR(bd)); | ||
| 192 | goto out; | ||
| 193 | } | ||
| 194 | |||
| 195 | rv = kzalloc(sizeof(*rv), GFP_NOFS); | ||
| 196 | if (!rv) { | ||
| 197 | rv = ERR_PTR(-ENOMEM); | ||
| 198 | goto out; | ||
| 199 | } | ||
| 200 | |||
| 201 | rv->bm_mdev = bd; | ||
| 202 | memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); | ||
| 203 | dprintk("%s Created device %s with bd_block_size %u\n", | ||
| 204 | __func__, | ||
| 205 | bd->bd_disk->disk_name, | ||
| 206 | bd->bd_block_size); | ||
| 207 | |||
| 208 | out: | ||
| 209 | kfree(msg.data); | ||
| 210 | return rv; | ||
| 211 | } | ||
| 212 | |||
| 213 | /* Map deviceid returned by the server to constructed block_device */ | ||
| 214 | static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, | ||
| 215 | struct nfs4_deviceid *id) | ||
| 216 | { | ||
| 217 | struct block_device *rv = NULL; | ||
| 218 | struct block_mount_id *mid; | ||
| 219 | struct pnfs_block_dev *dev; | ||
| 220 | |||
| 221 | dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); | ||
| 222 | mid = BLK_ID(lo); | ||
| 223 | spin_lock(&mid->bm_lock); | ||
| 224 | list_for_each_entry(dev, &mid->bm_devlist, bm_node) { | ||
| 225 | if (memcmp(id->data, dev->bm_mdevid.data, | ||
| 226 | NFS4_DEVICEID4_SIZE) == 0) { | ||
| 227 | rv = dev->bm_mdev; | ||
| 228 | goto out; | ||
| 229 | } | ||
| 230 | } | ||
| 231 | out: | ||
| 232 | spin_unlock(&mid->bm_lock); | ||
| 233 | dprintk("%s returning %p\n", __func__, rv); | ||
| 234 | return rv; | ||
| 235 | } | ||
| 236 | |||
| 237 | /* Tracks info needed to ensure extents in layout obey constraints of spec */ | ||
| 238 | struct layout_verification { | ||
| 239 | u32 mode; /* R or RW */ | ||
| 240 | u64 start; /* Expected start of next non-COW extent */ | ||
| 241 | u64 inval; /* Start of INVAL coverage */ | ||
| 242 | u64 cowread; /* End of COW read coverage */ | ||
| 243 | }; | ||
| 244 | |||
| 245 | /* Verify the extent meets the layout requirements of the pnfs-block draft, | ||
| 246 | * section 2.3.1. | ||
| 247 | */ | ||
| 248 | static int verify_extent(struct pnfs_block_extent *be, | ||
| 249 | struct layout_verification *lv) | ||
| 250 | { | ||
| 251 | if (lv->mode == IOMODE_READ) { | ||
| 252 | if (be->be_state == PNFS_BLOCK_READWRITE_DATA || | ||
| 253 | be->be_state == PNFS_BLOCK_INVALID_DATA) | ||
| 254 | return -EIO; | ||
| 255 | if (be->be_f_offset != lv->start) | ||
| 256 | return -EIO; | ||
| 257 | lv->start += be->be_length; | ||
| 258 | return 0; | ||
| 259 | } | ||
| 260 | /* lv->mode == IOMODE_RW */ | ||
| 261 | if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { | ||
| 262 | if (be->be_f_offset != lv->start) | ||
| 263 | return -EIO; | ||
| 264 | if (lv->cowread > lv->start) | ||
| 265 | return -EIO; | ||
| 266 | lv->start += be->be_length; | ||
| 267 | lv->inval = lv->start; | ||
| 268 | return 0; | ||
| 269 | } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | ||
| 270 | if (be->be_f_offset != lv->start) | ||
| 271 | return -EIO; | ||
| 272 | lv->start += be->be_length; | ||
| 273 | return 0; | ||
| 274 | } else if (be->be_state == PNFS_BLOCK_READ_DATA) { | ||
| 275 | if (be->be_f_offset > lv->start) | ||
| 276 | return -EIO; | ||
| 277 | if (be->be_f_offset < lv->inval) | ||
| 278 | return -EIO; | ||
| 279 | if (be->be_f_offset < lv->cowread) | ||
| 280 | return -EIO; | ||
| 281 | /* It looks like you might want to min this with lv->start, | ||
| 282 | * but you really don't. | ||
| 283 | */ | ||
| 284 | lv->inval = lv->inval + be->be_length; | ||
| 285 | lv->cowread = be->be_f_offset + be->be_length; | ||
| 286 | return 0; | ||
| 287 | } else | ||
| 288 | return -EIO; | ||
| 289 | } | ||
| 290 | |||
| 291 | /* XDR decode pnfs_block_layout4 structure */ | ||
| 292 | int | ||
| 293 | nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, | ||
| 294 | struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) | ||
| 295 | { | ||
| 296 | struct pnfs_block_layout *bl = BLK_LO2EXT(lo); | ||
| 297 | int i, status = -EIO; | ||
| 298 | uint32_t count; | ||
| 299 | struct pnfs_block_extent *be = NULL, *save; | ||
| 300 | struct xdr_stream stream; | ||
| 301 | struct xdr_buf buf; | ||
| 302 | struct page *scratch; | ||
| 303 | __be32 *p; | ||
| 304 | struct layout_verification lv = { | ||
| 305 | .mode = lgr->range.iomode, | ||
| 306 | .start = lgr->range.offset >> SECTOR_SHIFT, | ||
| 307 | .inval = lgr->range.offset >> SECTOR_SHIFT, | ||
| 308 | .cowread = lgr->range.offset >> SECTOR_SHIFT, | ||
| 309 | }; | ||
| 310 | LIST_HEAD(extents); | ||
| 311 | |||
| 312 | dprintk("---> %s\n", __func__); | ||
| 313 | |||
| 314 | scratch = alloc_page(gfp_flags); | ||
| 315 | if (!scratch) | ||
| 316 | return -ENOMEM; | ||
| 317 | |||
| 318 | xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); | ||
| 319 | xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); | ||
| 320 | |||
| 321 | p = xdr_inline_decode(&stream, 4); | ||
| 322 | if (unlikely(!p)) | ||
| 323 | goto out_err; | ||
| 324 | |||
| 325 | count = be32_to_cpup(p++); | ||
| 326 | |||
| 327 | dprintk("%s enter, number of extents %i\n", __func__, count); | ||
| 328 | p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); | ||
| 329 | if (unlikely(!p)) | ||
| 330 | goto out_err; | ||
| 331 | |||
| 332 | /* Decode individual extents, putting them in temporary | ||
| 333 | * staging area until whole layout is decoded to make error | ||
| 334 | * recovery easier. | ||
| 335 | */ | ||
| 336 | for (i = 0; i < count; i++) { | ||
| 337 | be = bl_alloc_extent(); | ||
| 338 | if (!be) { | ||
| 339 | status = -ENOMEM; | ||
| 340 | goto out_err; | ||
| 341 | } | ||
| 342 | memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); | ||
| 343 | p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); | ||
| 344 | be->be_mdev = translate_devid(lo, &be->be_devid); | ||
| 345 | if (!be->be_mdev) | ||
| 346 | goto out_err; | ||
| 347 | |||
| 348 | /* The next three values are read in as bytes, | ||
| 349 | * but stored as 512-byte sector lengths | ||
| 350 | */ | ||
| 351 | if (decode_sector_number(&p, &be->be_f_offset) < 0) | ||
| 352 | goto out_err; | ||
| 353 | if (decode_sector_number(&p, &be->be_length) < 0) | ||
| 354 | goto out_err; | ||
| 355 | if (decode_sector_number(&p, &be->be_v_offset) < 0) | ||
| 356 | goto out_err; | ||
| 357 | be->be_state = be32_to_cpup(p++); | ||
| 358 | if (be->be_state == PNFS_BLOCK_INVALID_DATA) | ||
| 359 | be->be_inval = &bl->bl_inval; | ||
| 360 | if (verify_extent(be, &lv)) { | ||
| 361 | dprintk("%s verify failed\n", __func__); | ||
| 362 | goto out_err; | ||
| 363 | } | ||
| 364 | list_add_tail(&be->be_node, &extents); | ||
| 365 | } | ||
| 366 | if (lgr->range.offset + lgr->range.length != | ||
| 367 | lv.start << SECTOR_SHIFT) { | ||
| 368 | dprintk("%s Final length mismatch\n", __func__); | ||
| 369 | be = NULL; | ||
| 370 | goto out_err; | ||
| 371 | } | ||
| 372 | if (lv.start < lv.cowread) { | ||
| 373 | dprintk("%s Final uncovered COW extent\n", __func__); | ||
| 374 | be = NULL; | ||
| 375 | goto out_err; | ||
| 376 | } | ||
| 377 | /* Extents decoded properly, now try to merge them in to | ||
| 378 | * existing layout extents. | ||
| 379 | */ | ||
| 380 | spin_lock(&bl->bl_ext_lock); | ||
| 381 | list_for_each_entry_safe(be, save, &extents, be_node) { | ||
| 382 | list_del(&be->be_node); | ||
| 383 | status = bl_add_merge_extent(bl, be); | ||
| 384 | if (status) { | ||
| 385 | spin_unlock(&bl->bl_ext_lock); | ||
| 386 | /* This is a fairly catastrophic error, as the | ||
| 387 | * entire layout extent lists are now corrupted. | ||
| 388 | * We should have some way to distinguish this. | ||
| 389 | */ | ||
| 390 | be = NULL; | ||
| 391 | goto out_err; | ||
| 392 | } | ||
| 393 | } | ||
| 394 | spin_unlock(&bl->bl_ext_lock); | ||
| 395 | status = 0; | ||
| 396 | out: | ||
| 397 | __free_page(scratch); | ||
| 398 | dprintk("%s returns %i\n", __func__, status); | ||
| 399 | return status; | ||
| 400 | |||
| 401 | out_err: | ||
| 402 | bl_put_extent(be); | ||
| 403 | while (!list_empty(&extents)) { | ||
| 404 | be = list_first_entry(&extents, struct pnfs_block_extent, | ||
| 405 | be_node); | ||
| 406 | list_del(&be->be_node); | ||
| 407 | bl_put_extent(be); | ||
| 408 | } | ||
| 409 | goto out; | ||
| 410 | } | ||
