Diffstat (limited to 'fs/btrfs/inode.c')
 fs/btrfs/inode.c | 487 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 463 insertions(+), 24 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 460dd512eebd..1695440a59a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -698,6 +698,38 @@ retry:
 	return 0;
 }
 
+static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+				      u64 num_bytes)
+{
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
+	u64 alloc_hint = 0;
+
+	read_lock(&em_tree->lock);
+	em = search_extent_mapping(em_tree, start, num_bytes);
+	if (em) {
+		/*
+		 * if block start isn't an actual block number then find the
+		 * first block in this inode and use that as a hint.  If that
+		 * block is also bogus then just don't worry about it.
+		 */
+		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+			free_extent_map(em);
+			em = search_extent_mapping(em_tree, 0, 0);
+			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+				alloc_hint = em->block_start;
+			if (em)
+				free_extent_map(em);
+		} else {
+			alloc_hint = em->block_start;
+			free_extent_map(em);
+		}
+	}
+	read_unlock(&em_tree->lock);
+
+	return alloc_hint;
+}
+
 /*
  * when extent_io.c finds a delayed allocation range in the file,
  * the call backs end up in this code.  The basic idea is to
@@ -770,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
 	BUG_ON(disk_num_bytes >
 	       btrfs_super_total_bytes(&root->fs_info->super_copy));
 
-
-	read_lock(&BTRFS_I(inode)->extent_tree.lock);
-	em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
-				   start, num_bytes);
-	if (em) {
-		/*
-		 * if block start isn't an actual block number then find the
-		 * first block in this inode and use that as a hint.  If that
-		 * block is also bogus then just don't worry about it.
-		 */
-		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-			free_extent_map(em);
-			em = search_extent_mapping(em_tree, 0, 0);
-			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
-				alloc_hint = em->block_start;
-			if (em)
-				free_extent_map(em);
-		} else {
-			alloc_hint = em->block_start;
-			free_extent_map(em);
-		}
-	}
-	read_unlock(&BTRFS_I(inode)->extent_tree.lock);
+	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 
 	while (disk_num_bytes > 0) {
@@ -5171,11 +5181,440 @@ out:
 	return em;
 }
 
+static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+						  u64 start, u64 len)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_key ins;
+	u64 alloc_hint;
+	int ret;
+
+	btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+
+	trans = btrfs_join_transaction(root, 0);
+	if (!trans)
+		return ERR_PTR(-ENOMEM);
+
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	alloc_hint = get_extent_allocation_hint(inode, start, len);
+	ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
+				   alloc_hint, (u64)-1, &ins, 1);
+	if (ret) {
+		em = ERR_PTR(ret);
+		goto out;
+	}
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em) {
+		em = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	em->start = start;
+	em->orig_start = em->start;
+	em->len = ins.offset;
+
+	em->block_start = ins.objectid;
+	em->block_len = ins.offset;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
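+	/*
+	 * add_extent_mapping returns -EEXIST while the range still overlaps
+	 * a cached extent, so keep dropping the old cache entries and retry
+	 */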
+	while (1) {
+		write_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+		if (ret != -EEXIST)
+			break;
+		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+	}
+
+	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+					   ins.offset, ins.offset, 0);
+	if (ret) {
+		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+		em = ERR_PTR(ret);
+	}
+out:
+	btrfs_end_transaction(trans, root);
+	return em;
+}
+
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start = (u64)iblock << inode->i_blkbits;
+	u64 len = bh_result->b_size;
+
+	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+	if (IS_ERR(em))
+		return PTR_ERR(em);
+
+	/*
+	 * Ok, for INLINE and COMPRESSED extents we need to fall back to
+	 * buffered IO.  INLINE is special, and we could probably kludge it
+	 * in here, but it's still buffered so for safety let's just fall
+	 * back to the generic buffered path.
+	 *
+	 * For COMPRESSED we _have_ to read the entire extent in so we can
+	 * decompress it, so there will be buffering required no matter what
+	 * we do, so go ahead and fall back to buffered.
+	 *
+	 * We return -ENOTBLK because that's what makes DIO go ahead and go
+	 * back to buffered IO.  Don't blame me, this is the price we pay for
+	 * using the generic code.
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+	    em->block_start == EXTENT_MAP_INLINE) {
+		free_extent_map(em);
+		return -ENOTBLK;
+	}
+
+	/* Just a good old fashioned hole, return */
+	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+		free_extent_map(em);
+		/* DIO will do one hole at a time, so just unlock a sector */
+		unlock_extent(&BTRFS_I(inode)->io_tree, start,
+			      start + root->sectorsize - 1, GFP_NOFS);
+		return 0;
+	}
+
+	/*
+	 * We don't allocate a new extent in the following cases
+	 *
+	 * 1) The inode is marked as NODATACOW.  In this case we'll just use
+	 *    the existing extent.
+	 * 2) The extent is marked as PREALLOC.  We're good to go here and
+	 *    can just use the extent.
+	 */
+	if (!create)
+		goto map;
+
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+	     em->block_start != EXTENT_MAP_HOLE)) {
+		u64 block_start;
+		int type;
+		int ret;
+
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			type = BTRFS_ORDERED_PREALLOC;
+		else
+			type = BTRFS_ORDERED_NOCOW;
+		len = min(len, em->block_len - (start - em->start));
+		block_start = em->block_start + (start - em->start);
+		ret = btrfs_add_ordered_extent_dio(inode, start,
+						   start, len, len, type);
+		if (ret) {
+			free_extent_map(em);
+			return ret;
+		}
+	} else {
+		free_extent_map(em);
+		em = btrfs_new_extent_direct(inode, start, len);
+		if (IS_ERR(em))
+			return PTR_ERR(em);
+		len = min(len, em->block_len);
+	}
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+		      GFP_NOFS);
+map:
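+	/* translate the extent map into the buffer_head dio expects */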
+	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+		inode->i_blkbits;
+	bh_result->b_size = em->len - (start - em->start);
+	bh_result->b_bdev = em->bdev;
+	set_buffer_mapped(bh_result);
+	if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		set_buffer_new(bh_result);
+
+	free_extent_map(em);
+
+	return 0;
+}
+
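+/*
+ * private data carried by a dio bio from submit time through to the
+ * endio handlers
+ */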
+struct btrfs_dio_private {
+	struct inode *inode;
+	u64 logical_offset;
+	u64 disk_bytenr;
+	u64 bytes;
+	u32 *csums;
+	void *private;
+};
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 start;
+	u32 *private = dip->csums;
+
+	start = dip->logical_offset;
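+	/* verify each page against the csums we looked up at submit time */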
+	do {
+		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+			struct page *page = bvec->bv_page;
+			char *kaddr;
+			u32 csum = ~(u32)0;
+			unsigned long flags;
+
+			local_irq_save(flags);
+			kaddr = kmap_atomic(page, KM_IRQ0);
+			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+					       csum, bvec->bv_len);
+			btrfs_csum_final(csum, (char *)&csum);
+			kunmap_atomic(kaddr, KM_IRQ0);
+			local_irq_restore(flags);
+
+			flush_dcache_page(bvec->bv_page);
+			if (csum != *private) {
+				printk(KERN_ERR "btrfs csum failed ino %lu off"
+				       " %llu csum %u private %u\n",
+				       inode->i_ino, (unsigned long long)start,
+				       csum, *private);
+				err = -EIO;
+			}
+		}
+
+		start += bvec->bv_len;
+		private++;
+		bvec++;
+	} while (bvec <= bvec_end);
+
+	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+		      dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+	bio->bi_private = dip->private;
+
+	kfree(dip->csums);
+	kfree(dip);
+	dio_end_io(bio, err);
+}
+
+static void btrfs_endio_direct_write(struct bio *bio, int err)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_ordered_extent *ordered = NULL;
+	struct extent_state *cached_state = NULL;
+	int ret;
+
+	if (err)
+		goto out_done;
+
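+	/*
+	 * only the bio that completes the last pending piece of the ordered
+	 * extent gets to do the accounting below
+	 */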
+	ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+					     dip->logical_offset, dip->bytes);
+	if (!ret)
+		goto out_done;
+
+	BUG_ON(!ordered);
+
+	trans = btrfs_join_transaction(root, 1);
+	if (!trans) {
+		err = -ENOMEM;
+		goto out;
+	}
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+		ret = btrfs_ordered_update_i_size(inode, 0, ordered);
+		if (!ret)
+			ret = btrfs_update_inode(trans, root, inode);
+		err = ret;
+		goto out;
+	}
+
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+			 ordered->file_offset + ordered->len - 1, 0,
+			 &cached_state, GFP_NOFS);
+
+	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+		ret = btrfs_mark_extent_written(trans, inode,
+						ordered->file_offset,
+						ordered->file_offset +
+						ordered->len);
+		if (ret) {
+			err = ret;
+			goto out_unlock;
+		}
+	} else {
+		ret = insert_reserved_file_extent(trans, inode,
+						  ordered->file_offset,
+						  ordered->start,
+						  ordered->disk_len,
+						  ordered->len,
+						  ordered->len,
+						  0, 0, 0,
+						  BTRFS_FILE_EXTENT_REG);
+		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+				   ordered->file_offset, ordered->len);
+		if (ret) {
+			err = ret;
+			WARN_ON(1);
+			goto out_unlock;
+		}
+	}
+
+	add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
+	btrfs_ordered_update_i_size(inode, 0, ordered);
+	btrfs_update_inode(trans, root, inode);
+out_unlock:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+			     ordered->file_offset + ordered->len - 1,
+			     &cached_state, GFP_NOFS);
+out:
+	btrfs_delalloc_release_metadata(inode, ordered->len);
+	btrfs_end_transaction(trans, root);
+	btrfs_put_ordered_extent(ordered);
+	btrfs_put_ordered_extent(ordered);
+out_done:
+	bio->bi_private = dip->private;
+
+	kfree(dip->csums);
+	kfree(dip);
+	dio_end_io(bio, err);
+}
+
+static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
+				loff_t file_offset)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_dio_private *dip;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	u64 start;
+	int skip_sum;
+	int write = rw & (1 << BIO_RW);
+	int ret = 0;
+
+	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	dip = kmalloc(sizeof(*dip), GFP_NOFS);
+	if (!dip) {
+		ret = -ENOMEM;
+		goto free_ordered;
+	}
+	dip->csums = NULL;
+
+	if (!skip_sum) {
+		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+		if (!dip->csums) {
+			ret = -ENOMEM;
+			goto free_ordered;
+		}
+	}
+
+	dip->private = bio->bi_private;
+	dip->inode = inode;
+	dip->logical_offset = file_offset;
+
+	start = dip->logical_offset;
+	dip->bytes = 0;
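+	/* add up the bvec lengths to get the total size of this dio */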
+	do {
+		dip->bytes += bvec->bv_len;
+		bvec++;
+	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+
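+	/* bi_sector is always in 512 byte units */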
+	dip->disk_bytenr = bio->bi_sector << 9;
+	bio->bi_private = dip;
+
+	if (write)
+		bio->bi_end_io = btrfs_endio_direct_write;
+	else
+		bio->bi_end_io = btrfs_endio_direct_read;
+
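+	/* run our endio handlers from the helper threads, not irq context */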
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	if (ret)
+		goto out_err;
+
+	if (write && !skip_sum)
+		btrfs_csum_one_bio(root, inode, bio, dip->logical_offset, 1);
+	else if (!skip_sum)
+		btrfs_lookup_bio_sums_dio(root, inode, bio,
+					  dip->logical_offset, dip->csums);
+
+	ret = btrfs_map_bio(root, rw, bio, 0, 0);
+	if (ret)
+		goto out_err;
+	return;
+out_err:
+	kfree(dip->csums);
+	kfree(dip);
+free_ordered:
+	/*
+	 * If this is a write, we need to clean up the reserved space and kill
+	 * the ordered extent.  dip may already be freed (or NULL) here, so
+	 * look the ordered extent up via the file_offset we were passed.
+	 */
+	if (write) {
+		struct btrfs_ordered_extent *ordered;
+		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+			btrfs_free_reserved_extent(root, ordered->start,
+						   ordered->disk_len);
+		btrfs_put_ordered_extent(ordered);
+		btrfs_put_ordered_extent(ordered);
+	}
+	bio_endio(bio, ret);
+}
+
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
-	return -EINVAL;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct btrfs_ordered_extent *ordered;
+	u64 lockstart, lockend;
+	ssize_t ret;
+
+	lockstart = offset;
+	lockend = offset + iov_length(iov, nr_segs) - 1;
+	while (1) {
+		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			    GFP_NOFS);
+		/*
+		 * We're concerned with the entire range that we're going to
+		 * be doing DIO to, so we need to make sure there are no
+		 * ordered extents in this range.
+		 */
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     lockend - lockstart + 1);
+		if (!ordered)
+			break;
+		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			      GFP_NOFS);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+		cond_resched();
+	}
+
+	ret = __blockdev_direct_IO(rw, iocb, inode, NULL, iov, offset, nr_segs,
+				   btrfs_get_blocks_direct, NULL,
+				   btrfs_submit_direct, 0);
+
+	if (ret < 0 && ret != -EIOCBQUEUED) {
+		unlock_extent(&BTRFS_I(inode)->io_tree, offset,
+			      offset + iov_length(iov, nr_segs) - 1, GFP_NOFS);
+	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
+		/*
+		 * We're falling back to buffered, unlock the section we didn't
+		 * do IO on.
+		 */
+		unlock_extent(&BTRFS_I(inode)->io_tree, offset + ret,
+			      offset + iov_length(iov, nr_segs) - 1, GFP_NOFS);
+	}
+
+	return ret;
 }
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,