author		Josef Bacik <josef@redhat.com>		2010-05-23 11:00:55 -0400
committer	Chris Mason <chris.mason@oracle.com>	2010-05-25 10:34:57 -0400
commit		4b46fce23349bfca781a32e2707a18328ca5ae22
tree		68f1200f2bc82d3f35218aef38e6d5d92bff4aca
parent		c2c6ca417e2db7a519e6e92c82f4a933d940d076
Btrfs: add basic DIO read/write support
This provides basic DIO support for reading and writing.  It does not do the
work to recover from mismatching checksums; that will come later.  A few
design changes have been made from Jim's code (sorry Jim!):

1) Use the generic direct-io code.  Jim originally re-wrote all the generic
   DIO code in order to account for all of BTRFS's oddities, but thanks to
   that work it seems like the best bet is to just ignore compression and
   such and simply fall back on buffered IO.

2) Fall back on buffered IO for compressed or inline extents.  Jim's code
   did its own buffering to make dio with compressed extents work.  Now we
   just fall back onto normal buffered IO.

3) Use ordered extents for the writes so that all of the lock_extent()
   lookup_ordered() type checks continue to work.

4) Do the lock_extent() lookup_ordered() loop in readpage so we don't race
   with DIO writes.

I've tested this with fsx and everything works great.  This patch depends on
my dio and filemap.c patches to work.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
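The heart of points 3) and 4) is a lock-and-drain pattern: take the extent
range lock, look up any overlapping ordered extent, and if one exists back
off, wait for it to finish, and retry.  A minimal sketch of that loop,
matching what btrfs_direct_IO() does in the diff below (declarations and
surrounding context elided; all identifiers are btrfs internals of this era):

	while (1) {
		/* take the extent range lock for [lockstart, lockend] */
		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			    GFP_NOFS);
		/* is any ordered extent still pending in this range? */
		ordered = btrfs_lookup_ordered_range(inode, lockstart,
						     lockend - lockstart + 1);
		if (!ordered)
			break;		/* range is clean, safe to do DIO */
		/* back off, wait for the in-flight ordered IO, retry */
		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			      GFP_NOFS);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}

Points 1) and 2) show up in btrfs_get_blocks_direct(): on a compressed or
inline extent it returns -ENOTBLK, which the generic __blockdev_direct_IO()
path takes as the cue to fall back to buffered IO.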
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--	fs/btrfs/inode.c	487
1 file changed, 463 insertions(+), 24 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 460dd512eebd..1695440a59a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -698,6 +698,38 @@ retry:
 	return 0;
 }
 
+static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+				      u64 num_bytes)
+{
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
+	u64 alloc_hint = 0;
+
+	read_lock(&em_tree->lock);
+	em = search_extent_mapping(em_tree, start, num_bytes);
+	if (em) {
+		/*
+		 * if block start isn't an actual block number then find the
+		 * first block in this inode and use that as a hint.  If that
+		 * block is also bogus then just don't worry about it.
+		 */
+		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+			free_extent_map(em);
+			em = search_extent_mapping(em_tree, 0, 0);
+			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+				alloc_hint = em->block_start;
+			if (em)
+				free_extent_map(em);
+		} else {
+			alloc_hint = em->block_start;
+			free_extent_map(em);
+		}
+	}
+	read_unlock(&em_tree->lock);
+
+	return alloc_hint;
+}
+
 /*
  * when extent_io.c finds a delayed allocation range in the file,
  * the call backs end up in this code.  The basic idea is to
@@ -770,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
 	BUG_ON(disk_num_bytes >
 	       btrfs_super_total_bytes(&root->fs_info->super_copy));
 
-
-	read_lock(&BTRFS_I(inode)->extent_tree.lock);
-	em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
-				   start, num_bytes);
-	if (em) {
-		/*
-		 * if block start isn't an actual block number then find the
-		 * first block in this inode and use that as a hint.  If that
-		 * block is also bogus then just don't worry about it.
-		 */
-		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-			free_extent_map(em);
-			em = search_extent_mapping(em_tree, 0, 0);
-			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
-				alloc_hint = em->block_start;
-			if (em)
-				free_extent_map(em);
-		} else {
-			alloc_hint = em->block_start;
-			free_extent_map(em);
-		}
-	}
-	read_unlock(&BTRFS_I(inode)->extent_tree.lock);
-
+	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 
 	while (disk_num_bytes > 0) {
@@ -5171,11 +5181,440 @@ out:
 	return em;
 }
 
+static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+						  u64 start, u64 len)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_key ins;
+	u64 alloc_hint;
+	int ret;
+
+	btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+
+	trans = btrfs_join_transaction(root, 0);
+	if (!trans)
+		return ERR_PTR(-ENOMEM);
+
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	alloc_hint = get_extent_allocation_hint(inode, start, len);
+	ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
+				   alloc_hint, (u64)-1, &ins, 1);
+	if (ret) {
+		em = ERR_PTR(ret);
+		goto out;
+	}
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em) {
+		em = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	em->start = start;
+	em->orig_start = em->start;
+	em->len = ins.offset;
+
+	em->block_start = ins.objectid;
+	em->block_len = ins.offset;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	while (1) {
+		write_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+		if (ret != -EEXIST)
+			break;
+		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+	}
+
+	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+					   ins.offset, ins.offset, 0);
+	if (ret) {
+		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+		em = ERR_PTR(ret);
+	}
+out:
+	btrfs_end_transaction(trans, root);
+	return em;
+}
+
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start = iblock << inode->i_blkbits;
+	u64 len = bh_result->b_size;
+
+	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	/*
+	 * Ok for INLINE and COMPRESSED extents we need to fall back on
+	 * buffered io.  INLINE is special, and we could probably kludge it
+	 * in here, but it's still buffered so for safety let's just fall
+	 * back to the generic buffered path.
+	 *
+	 * For COMPRESSED we _have_ to read the entire extent in so we can
+	 * decompress it, so there will be buffering required no matter what
+	 * we do, so go ahead and fall back to buffered.
+	 *
+	 * We return -ENOTBLK because that's what makes DIO go ahead and go
+	 * back to buffered IO.  Don't blame me, this is the price we pay for
+	 * using the generic code.
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+	    em->block_start == EXTENT_MAP_INLINE) {
+		free_extent_map(em);
+		return -ENOTBLK;
+	}
+
+	/* Just a good old fashioned hole, return */
+	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+		free_extent_map(em);
+		/* DIO will do one hole at a time, so just unlock a sector */
+		unlock_extent(&BTRFS_I(inode)->io_tree, start,
+			      start + root->sectorsize - 1, GFP_NOFS);
+		return 0;
+	}
+
+	/*
+	 * We don't allocate a new extent in the following cases
+	 *
+	 * 1) The inode is marked as NODATACOW.  In this case we'll just use
+	 *    the existing extent.
+	 * 2) The extent is marked as PREALLOC.  We're good to go here and
+	 *    can just use the extent.
+	 *
+	 */
+	if (!create)
+		goto map;
+
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+	     em->block_start != EXTENT_MAP_HOLE)) {
+		u64 block_start;
+		int type;
+		int ret;
+
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			type = BTRFS_ORDERED_PREALLOC;
+		else
+			type = BTRFS_ORDERED_NOCOW;
+		len = min(len, em->block_len - (start - em->start));
+		block_start = em->block_start + (start - em->start);
+		ret = btrfs_add_ordered_extent_dio(inode, start,
+						   start, len, len, type);
+		if (ret) {
+			free_extent_map(em);
+			return ret;
+		}
+	} else {
+		free_extent_map(em);
+		em = btrfs_new_extent_direct(inode, start, len);
+		if (IS_ERR(em))
+			return PTR_ERR(em);
+		len = min(len, em->block_len);
+	}
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+		      GFP_NOFS);
+map:
+	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+		inode->i_blkbits;
+	bh_result->b_size = em->len - (start - em->start);
+	bh_result->b_bdev = em->bdev;
+	set_buffer_mapped(bh_result);
+	if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		set_buffer_new(bh_result);
+
+	free_extent_map(em);
+
+	return 0;
+}
+
+struct btrfs_dio_private {
+	struct inode *inode;
+	u64 logical_offset;
+	u64 disk_bytenr;
+	u64 bytes;
+	u32 *csums;
+	void *private;
+};
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start;
+	u32 *private = dip->csums;
+
+	start = dip->logical_offset;
+	do {
+		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+			struct page *page = bvec->bv_page;
+			char *kaddr;
+			u32 csum = ~(u32)0;
+			unsigned long flags;
+
+			local_irq_save(flags);
+			kaddr = kmap_atomic(page, KM_IRQ0);
+			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+					       csum, bvec->bv_len);
+			btrfs_csum_final(csum, (char *)&csum);
+			kunmap_atomic(kaddr, KM_IRQ0);
+			local_irq_restore(flags);
+
+			flush_dcache_page(bvec->bv_page);
+			if (csum != *private) {
+				printk(KERN_ERR "btrfs csum failed ino %lu off"
+				       " %llu csum %u private %u\n",
+				       inode->i_ino, (unsigned long long)start,
+				       csum, *private);
+				err = -EIO;
+			}
+		}
+
+		start += bvec->bv_len;
+		private++;
+		bvec++;
+	} while (bvec <= bvec_end);
+
+	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+		      dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+	bio->bi_private = dip->private;
+
+	kfree(dip->csums);
+	kfree(dip);
+	dio_end_io(bio, err);
+}
+
+static void btrfs_endio_direct_write(struct bio *bio, int err)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_ordered_extent *ordered = NULL;
+	struct extent_state *cached_state = NULL;
+	int ret;
+
+	if (err)
+		goto out_done;
+
+	ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+					     dip->logical_offset, dip->bytes);
+	if (!ret)
+		goto out_done;
+
+	BUG_ON(!ordered);
+
+	trans = btrfs_join_transaction(root, 1);
+	if (!trans) {
+		err = -ENOMEM;
+		goto out;
+	}
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+		ret = btrfs_ordered_update_i_size(inode, 0, ordered);
+		if (!ret)
+			ret = btrfs_update_inode(trans, root, inode);
+		err = ret;
+		goto out;
+	}
+
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+			 ordered->file_offset + ordered->len - 1, 0,
+			 &cached_state, GFP_NOFS);
+
+	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+		ret = btrfs_mark_extent_written(trans, inode,
+						ordered->file_offset,
+						ordered->file_offset +
+						ordered->len);
+		if (ret) {
+			err = ret;
+			goto out_unlock;
+		}
+	} else {
+		ret = insert_reserved_file_extent(trans, inode,
+						  ordered->file_offset,
+						  ordered->start,
+						  ordered->disk_len,
+						  ordered->len,
+						  ordered->len,
+						  0, 0, 0,
+						  BTRFS_FILE_EXTENT_REG);
+		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+				   ordered->file_offset, ordered->len);
+		if (ret) {
+			err = ret;
+			WARN_ON(1);
+			goto out_unlock;
+		}
+	}
+
+	add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
+	btrfs_ordered_update_i_size(inode, 0, ordered);
+	btrfs_update_inode(trans, root, inode);
+out_unlock:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+			     ordered->file_offset + ordered->len - 1,
+			     &cached_state, GFP_NOFS);
+out:
+	btrfs_delalloc_release_metadata(inode, ordered->len);
+	btrfs_end_transaction(trans, root);
+	btrfs_put_ordered_extent(ordered);
+	btrfs_put_ordered_extent(ordered);
+out_done:
+	bio->bi_private = dip->private;
+
+	kfree(dip->csums);
+	kfree(dip);
+	dio_end_io(bio, err);
+}
+
+static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
+				loff_t file_offset)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_dio_private *dip;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	u64 start;
+	int skip_sum;
+	int write = rw & (1 << BIO_RW);
+	int ret = 0;
+
+	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	dip = kmalloc(sizeof(*dip), GFP_NOFS);
+	if (!dip) {
+		ret = -ENOMEM;
+		goto free_ordered;
+	}
+	dip->csums = NULL;
+
+	if (!skip_sum) {
+		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+		if (!dip->csums) {
+			ret = -ENOMEM;
+			goto free_ordered;
+		}
+	}
+
+	dip->private = bio->bi_private;
+	dip->inode = inode;
+	dip->logical_offset = file_offset;
+
+	start = dip->logical_offset;
+	dip->bytes = 0;
+	do {
+		dip->bytes += bvec->bv_len;
+		bvec++;
+	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+
+	dip->disk_bytenr = bio->bi_sector << 9;
+	bio->bi_private = dip;
+
+	if (write)
+		bio->bi_end_io = btrfs_endio_direct_write;
+	else
+		bio->bi_end_io = btrfs_endio_direct_read;
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	if (ret)
+		goto out_err;
+
+	if (write && !skip_sum)
+		btrfs_csum_one_bio(root, inode, bio, dip->logical_offset, 1);
+	else if (!skip_sum)
+		btrfs_lookup_bio_sums_dio(root, inode, bio,
+					  dip->logical_offset, dip->csums);
+
+	ret = btrfs_map_bio(root, rw, bio, 0, 0);
+	if (ret)
+		goto out_err;
+	return;
+out_err:
+	kfree(dip->csums);
+	kfree(dip);
+free_ordered:
+	/*
+	 * If this is a write, we need to clean up the reserved space and kill
+	 * the ordered extent.
+	 */
+	if (write) {
+		struct btrfs_ordered_extent *ordered;
+		ordered = btrfs_lookup_ordered_extent(inode,
+						      dip->logical_offset);
+		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+			btrfs_free_reserved_extent(root, ordered->start,
+						   ordered->disk_len);
+		btrfs_put_ordered_extent(ordered);
+		btrfs_put_ordered_extent(ordered);
+	}
+	bio_endio(bio, ret);
+}
+
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			       const struct iovec *iov, loff_t offset,
 			       unsigned long nr_segs)
 {
-	return -EINVAL;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct btrfs_ordered_extent *ordered;
+	u64 lockstart, lockend;
+	ssize_t ret;
+
+	lockstart = offset;
+	lockend = offset + iov_length(iov, nr_segs) - 1;
+	while (1) {
+		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			    GFP_NOFS);
+		/*
+		 * We're concerned with the entire range that we're going to
+		 * be doing DIO to, so we need to make sure there are no
+		 * ordered extents in this range.
+		 */
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     lockend - lockstart + 1);
+		if (!ordered)
+			break;
+		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			      GFP_NOFS);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+		cond_resched();
+	}
+
+	ret = __blockdev_direct_IO(rw, iocb, inode, NULL, iov, offset,
+				   nr_segs, btrfs_get_blocks_direct, NULL,
+				   btrfs_submit_direct, 0);
+
+	if (ret < 0 && ret != -EIOCBQUEUED) {
+		unlock_extent(&BTRFS_I(inode)->io_tree, offset,
+			      offset + iov_length(iov, nr_segs) - 1, GFP_NOFS);
+	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
+		/*
+		 * We're falling back to buffered, unlock the section we
+		 * didn't do IO on.
+		 */
+		unlock_extent(&BTRFS_I(inode)->io_tree, offset + ret,
+			      offset + iov_length(iov, nr_segs) - 1, GFP_NOFS);
+	}
+
+	return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
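
For orientation: btrfs_direct_IO() is reached through the .direct_IO method
of btrfs's address_space_operations, so generic O_DIRECT reads and writes
land here.  That registration lives elsewhere in fs/btrfs/inode.c and is
untouched by this patch; a trimmed sketch of what it looks like in kernels
of this era:

	/* sketch only -- most ops elided; .direct_IO is the hook the
	 * generic O_DIRECT path calls into */
	static const struct address_space_operations btrfs_aops = {
		.readpage	= btrfs_readpage,
		.writepage	= btrfs_writepage,
		.direct_IO	= btrfs_direct_IO,
		/* ... */
	};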