about | summary | refs | log | tree | commit | diff | stats
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- fs/btrfs/inode.c 684
1 files changed, 508 insertions, 176 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 016c403bfe7e..d23362f4464e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -153,7 +153,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
153 153
154 key.objectid = btrfs_ino(inode); 154 key.objectid = btrfs_ino(inode);
155 key.offset = start; 155 key.offset = start;
156 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 156 key.type = BTRFS_EXTENT_DATA_KEY;
157 157
158 datasize = btrfs_file_extent_calc_inline_size(cur_size); 158 datasize = btrfs_file_extent_calc_inline_size(cur_size);
159 path->leave_spinning = 1; 159 path->leave_spinning = 1;
@@ -249,8 +249,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
249 data_len = compressed_size; 249 data_len = compressed_size;
250 250
251 if (start > 0 || 251 if (start > 0 ||
252 actual_end >= PAGE_CACHE_SIZE || 252 actual_end > PAGE_CACHE_SIZE ||
253 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || 253 data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
254 (!compressed_size && 254 (!compressed_size &&
255 (actual_end & (root->sectorsize - 1)) == 0) || 255 (actual_end & (root->sectorsize - 1)) == 0) ||
256 end + 1 < isize || 256 end + 1 < isize ||
@@ -348,6 +348,23 @@ static noinline int add_async_extent(struct async_cow *cow,
348 return 0; 348 return 0;
349} 349}
350 350
351static inline int inode_need_compress(struct inode *inode)
352{
353 struct btrfs_root *root = BTRFS_I(inode)->root;
354
355 /* force compress */
356 if (btrfs_test_opt(root, FORCE_COMPRESS))
357 return 1;
358 /* bad compression ratios */
359 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
360 return 0;
361 if (btrfs_test_opt(root, COMPRESS) ||
362 BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
363 BTRFS_I(inode)->force_compress)
364 return 1;
365 return 0;
366}
367
351/* 368/*
352 * we create compressed extents in two phases. The first 369 * we create compressed extents in two phases. The first
353 * phase compresses a range of pages that have already been 370 * phase compresses a range of pages that have already been
@@ -444,10 +461,7 @@ again:
444 * inode has not been flagged as nocompress. This flag can 461 * inode has not been flagged as nocompress. This flag can
445 * change at any time if we discover bad compression ratios. 462 * change at any time if we discover bad compression ratios.
446 */ 463 */
447 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 464 if (inode_need_compress(inode)) {
448 (btrfs_test_opt(root, COMPRESS) ||
449 (BTRFS_I(inode)->force_compress) ||
450 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
451 WARN_ON(pages); 465 WARN_ON(pages);
452 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 466 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
453 if (!pages) { 467 if (!pages) {
@@ -1094,7 +1108,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1094 async_cow->locked_page = locked_page; 1108 async_cow->locked_page = locked_page;
1095 async_cow->start = start; 1109 async_cow->start = start;
1096 1110
1097 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) 1111 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1112 !btrfs_test_opt(root, FORCE_COMPRESS))
1098 cur_end = end; 1113 cur_end = end;
1099 else 1114 else
1100 cur_end = min(end, start + 512 * 1024 - 1); 1115 cur_end = min(end, start + 512 * 1024 - 1);
@@ -1445,6 +1460,26 @@ error:
1445 return ret; 1460 return ret;
1446} 1461}
1447 1462
1463static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1464{
1465
1466 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1467 !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1468 return 0;
1469
1470 /*
1471 * @defrag_bytes is a hint value, no spinlock held here,
1472 * if is not zero, it means the file is defragging.
1473 * Force cow if given extent needs to be defragged.
1474 */
1475 if (BTRFS_I(inode)->defrag_bytes &&
1476 test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1477 EXTENT_DEFRAG, 0, NULL))
1478 return 1;
1479
1480 return 0;
1481}
1482
1448/* 1483/*
1449 * extent_io.c call back to do delayed allocation processing 1484 * extent_io.c call back to do delayed allocation processing
1450 */ 1485 */
@@ -1453,17 +1488,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1453 unsigned long *nr_written) 1488 unsigned long *nr_written)
1454{ 1489{
1455 int ret; 1490 int ret;
1456 struct btrfs_root *root = BTRFS_I(inode)->root; 1491 int force_cow = need_force_cow(inode, start, end);
1457 1492
1458 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) { 1493 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1459 ret = run_delalloc_nocow(inode, locked_page, start, end, 1494 ret = run_delalloc_nocow(inode, locked_page, start, end,
1460 page_started, 1, nr_written); 1495 page_started, 1, nr_written);
1461 } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) { 1496 } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1462 ret = run_delalloc_nocow(inode, locked_page, start, end, 1497 ret = run_delalloc_nocow(inode, locked_page, start, end,
1463 page_started, 0, nr_written); 1498 page_started, 0, nr_written);
1464 } else if (!btrfs_test_opt(root, COMPRESS) && 1499 } else if (!inode_need_compress(inode)) {
1465 !(BTRFS_I(inode)->force_compress) &&
1466 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
1467 ret = cow_file_range(inode, locked_page, start, end, 1500 ret = cow_file_range(inode, locked_page, start, end,
1468 page_started, nr_written, 1); 1501 page_started, nr_written, 1);
1469 } else { 1502 } else {
@@ -1555,6 +1588,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
1555 struct extent_state *state, unsigned long *bits) 1588 struct extent_state *state, unsigned long *bits)
1556{ 1589{
1557 1590
1591 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1592 WARN_ON(1);
1558 /* 1593 /*
1559 * set_bit and clear bit hooks normally require _irqsave/restore 1594 * set_bit and clear bit hooks normally require _irqsave/restore
1560 * but in this case, we are only testing for the DELALLOC 1595 * but in this case, we are only testing for the DELALLOC
@@ -1577,6 +1612,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
1577 root->fs_info->delalloc_batch); 1612 root->fs_info->delalloc_batch);
1578 spin_lock(&BTRFS_I(inode)->lock); 1613 spin_lock(&BTRFS_I(inode)->lock);
1579 BTRFS_I(inode)->delalloc_bytes += len; 1614 BTRFS_I(inode)->delalloc_bytes += len;
1615 if (*bits & EXTENT_DEFRAG)
1616 BTRFS_I(inode)->defrag_bytes += len;
1580 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1617 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1581 &BTRFS_I(inode)->runtime_flags)) 1618 &BTRFS_I(inode)->runtime_flags))
1582 btrfs_add_delalloc_inodes(root, inode); 1619 btrfs_add_delalloc_inodes(root, inode);
@@ -1591,6 +1628,13 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1591 struct extent_state *state, 1628 struct extent_state *state,
1592 unsigned long *bits) 1629 unsigned long *bits)
1593{ 1630{
1631 u64 len = state->end + 1 - state->start;
1632
1633 spin_lock(&BTRFS_I(inode)->lock);
1634 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
1635 BTRFS_I(inode)->defrag_bytes -= len;
1636 spin_unlock(&BTRFS_I(inode)->lock);
1637
1594 /* 1638 /*
1595 * set_bit and clear bit hooks normally require _irqsave/restore 1639 * set_bit and clear bit hooks normally require _irqsave/restore
1596 * but in this case, we are only testing for the DELALLOC 1640 * but in this case, we are only testing for the DELALLOC
@@ -1598,7 +1642,6 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1598 */ 1642 */
1599 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1643 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1600 struct btrfs_root *root = BTRFS_I(inode)->root; 1644 struct btrfs_root *root = BTRFS_I(inode)->root;
1601 u64 len = state->end + 1 - state->start;
1602 bool do_list = !btrfs_is_free_space_inode(inode); 1645 bool do_list = !btrfs_is_free_space_inode(inode);
1603 1646
1604 if (*bits & EXTENT_FIRST_DELALLOC) { 1647 if (*bits & EXTENT_FIRST_DELALLOC) {
@@ -2660,6 +2703,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2660 goto out; 2703 goto out;
2661 } 2704 }
2662 2705
2706 btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
2707 ordered_extent->file_offset +
2708 ordered_extent->len - 1);
2709
2663 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { 2710 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2664 truncated = true; 2711 truncated = true;
2665 logical_len = ordered_extent->truncated_len; 2712 logical_len = ordered_extent->truncated_len;
@@ -2856,6 +2903,40 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2856 return 0; 2903 return 0;
2857} 2904}
2858 2905
2906static int __readpage_endio_check(struct inode *inode,
2907 struct btrfs_io_bio *io_bio,
2908 int icsum, struct page *page,
2909 int pgoff, u64 start, size_t len)
2910{
2911 char *kaddr;
2912 u32 csum_expected;
2913 u32 csum = ~(u32)0;
2914 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
2915 DEFAULT_RATELIMIT_BURST);
2916
2917 csum_expected = *(((u32 *)io_bio->csum) + icsum);
2918
2919 kaddr = kmap_atomic(page);
2920 csum = btrfs_csum_data(kaddr + pgoff, csum, len);
2921 btrfs_csum_final(csum, (char *)&csum);
2922 if (csum != csum_expected)
2923 goto zeroit;
2924
2925 kunmap_atomic(kaddr);
2926 return 0;
2927zeroit:
2928 if (__ratelimit(&_rs))
2929 btrfs_info(BTRFS_I(inode)->root->fs_info,
2930 "csum failed ino %llu off %llu csum %u expected csum %u",
2931 btrfs_ino(inode), start, csum, csum_expected);
2932 memset(kaddr + pgoff, 1, len);
2933 flush_dcache_page(page);
2934 kunmap_atomic(kaddr);
2935 if (csum_expected == 0)
2936 return 0;
2937 return -EIO;
2938}
2939
2859/* 2940/*
2860 * when reads are done, we need to check csums to verify the data is correct 2941 * when reads are done, we need to check csums to verify the data is correct
2861 * if there's a match, we allow the bio to finish. If not, the code in 2942 * if there's a match, we allow the bio to finish. If not, the code in
@@ -2868,20 +2949,15 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
2868 size_t offset = start - page_offset(page); 2949 size_t offset = start - page_offset(page);
2869 struct inode *inode = page->mapping->host; 2950 struct inode *inode = page->mapping->host;
2870 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 2951 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2871 char *kaddr;
2872 struct btrfs_root *root = BTRFS_I(inode)->root; 2952 struct btrfs_root *root = BTRFS_I(inode)->root;
2873 u32 csum_expected;
2874 u32 csum = ~(u32)0;
2875 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
2876 DEFAULT_RATELIMIT_BURST);
2877 2953
2878 if (PageChecked(page)) { 2954 if (PageChecked(page)) {
2879 ClearPageChecked(page); 2955 ClearPageChecked(page);
2880 goto good; 2956 return 0;
2881 } 2957 }
2882 2958
2883 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 2959 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
2884 goto good; 2960 return 0;
2885 2961
2886 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 2962 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
2887 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 2963 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
@@ -2891,28 +2967,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
2891 } 2967 }
2892 2968
2893 phy_offset >>= inode->i_sb->s_blocksize_bits; 2969 phy_offset >>= inode->i_sb->s_blocksize_bits;
2894 csum_expected = *(((u32 *)io_bio->csum) + phy_offset); 2970 return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
2895 2971 start, (size_t)(end - start + 1));
2896 kaddr = kmap_atomic(page);
2897 csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1);
2898 btrfs_csum_final(csum, (char *)&csum);
2899 if (csum != csum_expected)
2900 goto zeroit;
2901
2902 kunmap_atomic(kaddr);
2903good:
2904 return 0;
2905
2906zeroit:
2907 if (__ratelimit(&_rs))
2908 btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
2909 btrfs_ino(page->mapping->host), start, csum, csum_expected);
2910 memset(kaddr + offset, 1, end - start + 1);
2911 flush_dcache_page(page);
2912 kunmap_atomic(kaddr);
2913 if (csum_expected == 0)
2914 return 0;
2915 return -EIO;
2916} 2972}
2917 2973
2918struct delayed_iput { 2974struct delayed_iput {
@@ -3159,7 +3215,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
3159 path->reada = -1; 3215 path->reada = -1;
3160 3216
3161 key.objectid = BTRFS_ORPHAN_OBJECTID; 3217 key.objectid = BTRFS_ORPHAN_OBJECTID;
3162 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 3218 key.type = BTRFS_ORPHAN_ITEM_KEY;
3163 key.offset = (u64)-1; 3219 key.offset = (u64)-1;
3164 3220
3165 while (1) { 3221 while (1) {
@@ -3186,7 +3242,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
3186 /* make sure the item matches what we want */ 3242 /* make sure the item matches what we want */
3187 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) 3243 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3188 break; 3244 break;
3189 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) 3245 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3190 break; 3246 break;
3191 3247
3192 /* release the path since we're done with it */ 3248 /* release the path since we're done with it */
@@ -3662,7 +3718,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3662 * without delay 3718 * without delay
3663 */ 3719 */
3664 if (!btrfs_is_free_space_inode(inode) 3720 if (!btrfs_is_free_space_inode(inode)
3665 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { 3721 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
3722 && !root->fs_info->log_root_recovering) {
3666 btrfs_update_root_times(trans, root); 3723 btrfs_update_root_times(trans, root);
3667 3724
3668 ret = btrfs_delayed_update_inode(trans, root, inode); 3725 ret = btrfs_delayed_update_inode(trans, root, inode);
@@ -4085,7 +4142,7 @@ search_again:
4085 fi = NULL; 4142 fi = NULL;
4086 leaf = path->nodes[0]; 4143 leaf = path->nodes[0];
4087 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4144 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4088 found_type = btrfs_key_type(&found_key); 4145 found_type = found_key.type;
4089 4146
4090 if (found_key.objectid != ino) 4147 if (found_key.objectid != ino)
4091 break; 4148 break;
@@ -4747,6 +4804,8 @@ void btrfs_evict_inode(struct inode *inode)
4747 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ 4804 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
4748 btrfs_wait_ordered_range(inode, 0, (u64)-1); 4805 btrfs_wait_ordered_range(inode, 0, (u64)-1);
4749 4806
4807 btrfs_free_io_failure_record(inode, 0, (u64)-1);
4808
4750 if (root->fs_info->log_root_recovering) { 4809 if (root->fs_info->log_root_recovering) {
4751 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 4810 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
4752 &BTRFS_I(inode)->runtime_flags)); 4811 &BTRFS_I(inode)->runtime_flags));
@@ -5202,42 +5261,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5202 iput(inode); 5261 iput(inode);
5203 inode = ERR_PTR(ret); 5262 inode = ERR_PTR(ret);
5204 } 5263 }
5205 /*
5206 * If orphan cleanup did remove any orphans, it means the tree
5207 * was modified and therefore the commit root is not the same as
5208 * the current root anymore. This is a problem, because send
5209 * uses the commit root and therefore can see inode items that
5210 * don't exist in the current root anymore, and for example make
5211 * calls to btrfs_iget, which will do tree lookups based on the
5212 * current root and not on the commit root. Those lookups will
5213 * fail, returning a -ESTALE error, and making send fail with
5214 * that error. So make sure a send does not see any orphans we
5215 * have just removed, and that it will see the same inodes
5216 * regardless of whether a transaction commit happened before
5217 * it started (meaning that the commit root will be the same as
5218 * the current root) or not.
5219 */
5220 if (sub_root->node != sub_root->commit_root) {
5221 u64 sub_flags = btrfs_root_flags(&sub_root->root_item);
5222
5223 if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) {
5224 struct extent_buffer *eb;
5225
5226 /*
5227 * Assert we can't have races between dentry
5228 * lookup called through the snapshot creation
5229 * ioctl and the VFS.
5230 */
5231 ASSERT(mutex_is_locked(&dir->i_mutex));
5232
5233 down_write(&root->fs_info->commit_root_sem);
5234 eb = sub_root->commit_root;
5235 sub_root->commit_root =
5236 btrfs_root_node(sub_root);
5237 up_write(&root->fs_info->commit_root_sem);
5238 free_extent_buffer(eb);
5239 }
5240 }
5241 } 5264 }
5242 5265
5243 return inode; 5266 return inode;
@@ -5331,7 +5354,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5331 btrfs_get_delayed_items(inode, &ins_list, &del_list); 5354 btrfs_get_delayed_items(inode, &ins_list, &del_list);
5332 } 5355 }
5333 5356
5334 btrfs_set_key_type(&key, key_type); 5357 key.type = key_type;
5335 key.offset = ctx->pos; 5358 key.offset = ctx->pos;
5336 key.objectid = btrfs_ino(inode); 5359 key.objectid = btrfs_ino(inode);
5337 5360
@@ -5356,7 +5379,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5356 5379
5357 if (found_key.objectid != key.objectid) 5380 if (found_key.objectid != key.objectid)
5358 break; 5381 break;
5359 if (btrfs_key_type(&found_key) != key_type) 5382 if (found_key.type != key_type)
5360 break; 5383 break;
5361 if (found_key.offset < ctx->pos) 5384 if (found_key.offset < ctx->pos)
5362 goto next; 5385 goto next;
@@ -5568,7 +5591,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
5568 int ret; 5591 int ret;
5569 5592
5570 key.objectid = btrfs_ino(inode); 5593 key.objectid = btrfs_ino(inode);
5571 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 5594 key.type = BTRFS_DIR_INDEX_KEY;
5572 key.offset = (u64)-1; 5595 key.offset = (u64)-1;
5573 5596
5574 path = btrfs_alloc_path(); 5597 path = btrfs_alloc_path();
@@ -5600,7 +5623,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
5600 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5623 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5601 5624
5602 if (found_key.objectid != btrfs_ino(inode) || 5625 if (found_key.objectid != btrfs_ino(inode) ||
5603 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { 5626 found_key.type != BTRFS_DIR_INDEX_KEY) {
5604 BTRFS_I(inode)->index_cnt = 2; 5627 BTRFS_I(inode)->index_cnt = 2;
5605 goto out; 5628 goto out;
5606 } 5629 }
@@ -5718,7 +5741,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5718 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 5741 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
5719 5742
5720 key[0].objectid = objectid; 5743 key[0].objectid = objectid;
5721 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 5744 key[0].type = BTRFS_INODE_ITEM_KEY;
5722 key[0].offset = 0; 5745 key[0].offset = 0;
5723 5746
5724 sizes[0] = sizeof(struct btrfs_inode_item); 5747 sizes[0] = sizeof(struct btrfs_inode_item);
@@ -5731,7 +5754,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5731 * add more hard links than can fit in the ref item. 5754 * add more hard links than can fit in the ref item.
5732 */ 5755 */
5733 key[1].objectid = objectid; 5756 key[1].objectid = objectid;
5734 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); 5757 key[1].type = BTRFS_INODE_REF_KEY;
5735 key[1].offset = ref_objectid; 5758 key[1].offset = ref_objectid;
5736 5759
5737 sizes[1] = name_len + sizeof(*ref); 5760 sizes[1] = name_len + sizeof(*ref);
@@ -5740,7 +5763,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5740 location = &BTRFS_I(inode)->location; 5763 location = &BTRFS_I(inode)->location;
5741 location->objectid = objectid; 5764 location->objectid = objectid;
5742 location->offset = 0; 5765 location->offset = 0;
5743 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); 5766 location->type = BTRFS_INODE_ITEM_KEY;
5744 5767
5745 ret = btrfs_insert_inode_locked(inode); 5768 ret = btrfs_insert_inode_locked(inode);
5746 if (ret < 0) 5769 if (ret < 0)
@@ -5832,7 +5855,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
5832 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); 5855 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
5833 } else { 5856 } else {
5834 key.objectid = ino; 5857 key.objectid = ino;
5835 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 5858 key.type = BTRFS_INODE_ITEM_KEY;
5836 key.offset = 0; 5859 key.offset = 0;
5837 } 5860 }
5838 5861
@@ -6191,21 +6214,60 @@ out_fail_inode:
6191 goto out_fail; 6214 goto out_fail;
6192} 6215}
6193 6216
6217/* Find next extent map of a given extent map, caller needs to ensure locks */
6218static struct extent_map *next_extent_map(struct extent_map *em)
6219{
6220 struct rb_node *next;
6221
6222 next = rb_next(&em->rb_node);
6223 if (!next)
6224 return NULL;
6225 return container_of(next, struct extent_map, rb_node);
6226}
6227
6228static struct extent_map *prev_extent_map(struct extent_map *em)
6229{
6230 struct rb_node *prev;
6231
6232 prev = rb_prev(&em->rb_node);
6233 if (!prev)
6234 return NULL;
6235 return container_of(prev, struct extent_map, rb_node);
6236}
6237
6194/* helper for btfs_get_extent. Given an existing extent in the tree, 6238/* helper for btfs_get_extent. Given an existing extent in the tree,
6239 * the existing extent is the nearest extent to map_start,
6195 * and an extent that you want to insert, deal with overlap and insert 6240 * and an extent that you want to insert, deal with overlap and insert
6196 * the new extent into the tree. 6241 * the best fitted new extent into the tree.
6197 */ 6242 */
6198static int merge_extent_mapping(struct extent_map_tree *em_tree, 6243static int merge_extent_mapping(struct extent_map_tree *em_tree,
6199 struct extent_map *existing, 6244 struct extent_map *existing,
6200 struct extent_map *em, 6245 struct extent_map *em,
6201 u64 map_start) 6246 u64 map_start)
6202{ 6247{
6248 struct extent_map *prev;
6249 struct extent_map *next;
6250 u64 start;
6251 u64 end;
6203 u64 start_diff; 6252 u64 start_diff;
6204 6253
6205 BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); 6254 BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
6206 start_diff = map_start - em->start; 6255
6207 em->start = map_start; 6256 if (existing->start > map_start) {
6208 em->len = existing->start - em->start; 6257 next = existing;
6258 prev = prev_extent_map(next);
6259 } else {
6260 prev = existing;
6261 next = next_extent_map(prev);
6262 }
6263
6264 start = prev ? extent_map_end(prev) : em->start;
6265 start = max_t(u64, start, em->start);
6266 end = next ? next->start : extent_map_end(em);
6267 end = min_t(u64, end, extent_map_end(em));
6268 start_diff = start - em->start;
6269 em->start = start;
6270 em->len = end - start;
6209 if (em->block_start < EXTENT_MAP_LAST_BYTE && 6271 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
6210 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 6272 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
6211 em->block_start += start_diff; 6273 em->block_start += start_diff;
@@ -6333,7 +6395,7 @@ again:
6333 struct btrfs_file_extent_item); 6395 struct btrfs_file_extent_item);
6334 /* are we inside the extent that was found? */ 6396 /* are we inside the extent that was found? */
6335 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6397 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6336 found_type = btrfs_key_type(&found_key); 6398 found_type = found_key.type;
6337 if (found_key.objectid != objectid || 6399 if (found_key.objectid != objectid ||
6338 found_type != BTRFS_EXTENT_DATA_KEY) { 6400 found_type != BTRFS_EXTENT_DATA_KEY) {
6339 /* 6401 /*
@@ -6482,25 +6544,21 @@ insert:
6482 6544
6483 ret = 0; 6545 ret = 0;
6484 6546
6485 existing = lookup_extent_mapping(em_tree, start, len); 6547 existing = search_extent_mapping(em_tree, start, len);
6486 if (existing && (existing->start > start || 6548 /*
6487 existing->start + existing->len <= start)) { 6549 * existing will always be non-NULL, since there must be
6550 * extent causing the -EEXIST.
6551 */
6552 if (start >= extent_map_end(existing) ||
6553 start <= existing->start) {
6554 /*
6555 * The existing extent map is the one nearest to
6556 * the [start, start + len) range which overlaps
6557 */
6558 err = merge_extent_mapping(em_tree, existing,
6559 em, start);
6488 free_extent_map(existing); 6560 free_extent_map(existing);
6489 existing = NULL; 6561 if (err) {
6490 }
6491 if (!existing) {
6492 existing = lookup_extent_mapping(em_tree, em->start,
6493 em->len);
6494 if (existing) {
6495 err = merge_extent_mapping(em_tree, existing,
6496 em, start);
6497 free_extent_map(existing);
6498 if (err) {
6499 free_extent_map(em);
6500 em = NULL;
6501 }
6502 } else {
6503 err = -EIO;
6504 free_extent_map(em); 6562 free_extent_map(em);
6505 em = NULL; 6563 em = NULL;
6506 } 6564 }
@@ -7112,8 +7170,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7112 block_start, len, 7170 block_start, len,
7113 orig_block_len, 7171 orig_block_len,
7114 ram_bytes, type); 7172 ram_bytes, type);
7115 if (IS_ERR(em)) 7173 if (IS_ERR(em)) {
7174 ret = PTR_ERR(em);
7116 goto unlock_err; 7175 goto unlock_err;
7176 }
7117 } 7177 }
7118 7178
7119 ret = btrfs_add_ordered_extent_dio(inode, start, 7179 ret = btrfs_add_ordered_extent_dio(inode, start,
@@ -7188,45 +7248,277 @@ unlock_err:
7188 return ret; 7248 return ret;
7189} 7249}
7190 7250
7191static void btrfs_endio_direct_read(struct bio *bio, int err) 7251static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
7252 int rw, int mirror_num)
7192{ 7253{
7193 struct btrfs_dio_private *dip = bio->bi_private;
7194 struct bio_vec *bvec;
7195 struct inode *inode = dip->inode;
7196 struct btrfs_root *root = BTRFS_I(inode)->root; 7254 struct btrfs_root *root = BTRFS_I(inode)->root;
7197 struct bio *dio_bio; 7255 int ret;
7198 u32 *csums = (u32 *)dip->csum; 7256
7257 BUG_ON(rw & REQ_WRITE);
7258
7259 bio_get(bio);
7260
7261 ret = btrfs_bio_wq_end_io(root->fs_info, bio,
7262 BTRFS_WQ_ENDIO_DIO_REPAIR);
7263 if (ret)
7264 goto err;
7265
7266 ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
7267err:
7268 bio_put(bio);
7269 return ret;
7270}
7271
7272static int btrfs_check_dio_repairable(struct inode *inode,
7273 struct bio *failed_bio,
7274 struct io_failure_record *failrec,
7275 int failed_mirror)
7276{
7277 int num_copies;
7278
7279 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
7280 failrec->logical, failrec->len);
7281 if (num_copies == 1) {
7282 /*
7283 * we only have a single copy of the data, so don't bother with
7284 * all the retry and error correction code that follows. no
7285 * matter what the error is, it is very likely to persist.
7286 */
7287 pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
7288 num_copies, failrec->this_mirror, failed_mirror);
7289 return 0;
7290 }
7291
7292 failrec->failed_mirror = failed_mirror;
7293 failrec->this_mirror++;
7294 if (failrec->this_mirror == failed_mirror)
7295 failrec->this_mirror++;
7296
7297 if (failrec->this_mirror > num_copies) {
7298 pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
7299 num_copies, failrec->this_mirror, failed_mirror);
7300 return 0;
7301 }
7302
7303 return 1;
7304}
7305
7306static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7307 struct page *page, u64 start, u64 end,
7308 int failed_mirror, bio_end_io_t *repair_endio,
7309 void *repair_arg)
7310{
7311 struct io_failure_record *failrec;
7312 struct bio *bio;
7313 int isector;
7314 int read_mode;
7315 int ret;
7316
7317 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
7318
7319 ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
7320 if (ret)
7321 return ret;
7322
7323 ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
7324 failed_mirror);
7325 if (!ret) {
7326 free_io_failure(inode, failrec);
7327 return -EIO;
7328 }
7329
7330 if (failed_bio->bi_vcnt > 1)
7331 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
7332 else
7333 read_mode = READ_SYNC;
7334
7335 isector = start - btrfs_io_bio(failed_bio)->logical;
7336 isector >>= inode->i_sb->s_blocksize_bits;
7337 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
7338 0, isector, repair_endio, repair_arg);
7339 if (!bio) {
7340 free_io_failure(inode, failrec);
7341 return -EIO;
7342 }
7343
7344 btrfs_debug(BTRFS_I(inode)->root->fs_info,
7345 "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
7346 read_mode, failrec->this_mirror, failrec->in_validation);
7347
7348 ret = submit_dio_repair_bio(inode, bio, read_mode,
7349 failrec->this_mirror);
7350 if (ret) {
7351 free_io_failure(inode, failrec);
7352 bio_put(bio);
7353 }
7354
7355 return ret;
7356}
7357
7358struct btrfs_retry_complete {
7359 struct completion done;
7360 struct inode *inode;
7361 u64 start;
7362 int uptodate;
7363};
7364
7365static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
7366{
7367 struct btrfs_retry_complete *done = bio->bi_private;
7368 struct bio_vec *bvec;
7369 int i;
7370
7371 if (err)
7372 goto end;
7373
7374 done->uptodate = 1;
7375 bio_for_each_segment_all(bvec, bio, i)
7376 clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
7377end:
7378 complete(&done->done);
7379 bio_put(bio);
7380}
7381
7382static int __btrfs_correct_data_nocsum(struct inode *inode,
7383 struct btrfs_io_bio *io_bio)
7384{
7385 struct bio_vec *bvec;
7386 struct btrfs_retry_complete done;
7199 u64 start; 7387 u64 start;
7200 int i; 7388 int i;
7389 int ret;
7390
7391 start = io_bio->logical;
7392 done.inode = inode;
7393
7394 bio_for_each_segment_all(bvec, &io_bio->bio, i) {
7395try_again:
7396 done.uptodate = 0;
7397 done.start = start;
7398 init_completion(&done.done);
7399
7400 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
7401 start + bvec->bv_len - 1,
7402 io_bio->mirror_num,
7403 btrfs_retry_endio_nocsum, &done);
7404 if (ret)
7405 return ret;
7406
7407 wait_for_completion(&done.done);
7408
7409 if (!done.uptodate) {
7410 /* We might have another mirror, so try again */
7411 goto try_again;
7412 }
7413
7414 start += bvec->bv_len;
7415 }
7201 7416
7202 start = dip->logical_offset; 7417 return 0;
7418}
7419
7420static void btrfs_retry_endio(struct bio *bio, int err)
7421{
7422 struct btrfs_retry_complete *done = bio->bi_private;
7423 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7424 struct bio_vec *bvec;
7425 int uptodate;
7426 int ret;
7427 int i;
7428
7429 if (err)
7430 goto end;
7431
7432 uptodate = 1;
7203 bio_for_each_segment_all(bvec, bio, i) { 7433 bio_for_each_segment_all(bvec, bio, i) {
7204 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 7434 ret = __readpage_endio_check(done->inode, io_bio, i,
7205 struct page *page = bvec->bv_page; 7435 bvec->bv_page, 0,
7206 char *kaddr; 7436 done->start, bvec->bv_len);
7207 u32 csum = ~(u32)0; 7437 if (!ret)
7208 unsigned long flags; 7438 clean_io_failure(done->inode, done->start,
7209 7439 bvec->bv_page, 0);
7210 local_irq_save(flags); 7440 else
7211 kaddr = kmap_atomic(page); 7441 uptodate = 0;
7212 csum = btrfs_csum_data(kaddr + bvec->bv_offset, 7442 }
7213 csum, bvec->bv_len); 7443
7214 btrfs_csum_final(csum, (char *)&csum); 7444 done->uptodate = uptodate;
7215 kunmap_atomic(kaddr); 7445end:
7216 local_irq_restore(flags); 7446 complete(&done->done);
7217 7447 bio_put(bio);
7218 flush_dcache_page(bvec->bv_page); 7448}
7219 if (csum != csums[i]) { 7449
7220 btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", 7450static int __btrfs_subio_endio_read(struct inode *inode,
7221 btrfs_ino(inode), start, csum, 7451 struct btrfs_io_bio *io_bio, int err)
7222 csums[i]); 7452{
7223 err = -EIO; 7453 struct bio_vec *bvec;
7224 } 7454 struct btrfs_retry_complete done;
7455 u64 start;
7456 u64 offset = 0;
7457 int i;
7458 int ret;
7459
7460 err = 0;
7461 start = io_bio->logical;
7462 done.inode = inode;
7463
7464 bio_for_each_segment_all(bvec, &io_bio->bio, i) {
7465 ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
7466 0, start, bvec->bv_len);
7467 if (likely(!ret))
7468 goto next;
7469try_again:
7470 done.uptodate = 0;
7471 done.start = start;
7472 init_completion(&done.done);
7473
7474 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
7475 start + bvec->bv_len - 1,
7476 io_bio->mirror_num,
7477 btrfs_retry_endio, &done);
7478 if (ret) {
7479 err = ret;
7480 goto next;
7225 } 7481 }
7226 7482
7483 wait_for_completion(&done.done);
7484
7485 if (!done.uptodate) {
7486 /* We might have another mirror, so try again */
7487 goto try_again;
7488 }
7489next:
7490 offset += bvec->bv_len;
7227 start += bvec->bv_len; 7491 start += bvec->bv_len;
7228 } 7492 }
7229 7493
7494 return err;
7495}
7496
7497static int btrfs_subio_endio_read(struct inode *inode,
7498 struct btrfs_io_bio *io_bio, int err)
7499{
7500 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
7501
7502 if (skip_csum) {
7503 if (unlikely(err))
7504 return __btrfs_correct_data_nocsum(inode, io_bio);
7505 else
7506 return 0;
7507 } else {
7508 return __btrfs_subio_endio_read(inode, io_bio, err);
7509 }
7510}
7511
7512static void btrfs_endio_direct_read(struct bio *bio, int err)
7513{
7514 struct btrfs_dio_private *dip = bio->bi_private;
7515 struct inode *inode = dip->inode;
7516 struct bio *dio_bio;
7517 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7518
7519 if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
7520 err = btrfs_subio_endio_read(inode, io_bio, err);
7521
7230 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, 7522 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
7231 dip->logical_offset + dip->bytes - 1); 7523 dip->logical_offset + dip->bytes - 1);
7232 dio_bio = dip->dio_bio; 7524 dio_bio = dip->dio_bio;
@@ -7237,6 +7529,9 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
7237 if (err) 7529 if (err)
7238 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); 7530 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
7239 dio_end_io(dio_bio, err); 7531 dio_end_io(dio_bio, err);
7532
7533 if (io_bio->end_io)
7534 io_bio->end_io(io_bio, err);
7240 bio_put(bio); 7535 bio_put(bio);
7241} 7536}
7242 7537
@@ -7302,12 +7597,17 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
7302{ 7597{
7303 struct btrfs_dio_private *dip = bio->bi_private; 7598 struct btrfs_dio_private *dip = bio->bi_private;
7304 7599
7600 if (err)
7601 btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
7602 "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
7603 btrfs_ino(dip->inode), bio->bi_rw,
7604 (unsigned long long)bio->bi_iter.bi_sector,
7605 bio->bi_iter.bi_size, err);
7606
7607 if (dip->subio_endio)
7608 err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
7609
7305 if (err) { 7610 if (err) {
7306 btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
7307 "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
7308 btrfs_ino(dip->inode), bio->bi_rw,
7309 (unsigned long long)bio->bi_iter.bi_sector,
7310 bio->bi_iter.bi_size, err);
7311 dip->errors = 1; 7611 dip->errors = 1;
7312 7612
7313 /* 7613 /*
@@ -7338,6 +7638,38 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
7338 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); 7638 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
7339} 7639}
7340 7640
7641static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
7642 struct inode *inode,
7643 struct btrfs_dio_private *dip,
7644 struct bio *bio,
7645 u64 file_offset)
7646{
7647 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7648 struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
7649 int ret;
7650
7651 /*
7652 * We load all the csum data we need when we submit
7653 * the first bio to reduce the csum tree search and
7654 * contention.
7655 */
7656 if (dip->logical_offset == file_offset) {
7657 ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
7658 file_offset);
7659 if (ret)
7660 return ret;
7661 }
7662
7663 if (bio == dip->orig_bio)
7664 return 0;
7665
7666 file_offset -= dip->logical_offset;
7667 file_offset >>= inode->i_sb->s_blocksize_bits;
7668 io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
7669
7670 return 0;
7671}
7672
7341static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 7673static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7342 int rw, u64 file_offset, int skip_sum, 7674 int rw, u64 file_offset, int skip_sum,
7343 int async_submit) 7675 int async_submit)
@@ -7353,7 +7685,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7353 bio_get(bio); 7685 bio_get(bio);
7354 7686
7355 if (!write) { 7687 if (!write) {
7356 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 7688 ret = btrfs_bio_wq_end_io(root->fs_info, bio,
7689 BTRFS_WQ_ENDIO_DATA);
7357 if (ret) 7690 if (ret)
7358 goto err; 7691 goto err;
7359 } 7692 }
@@ -7376,13 +7709,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7376 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); 7709 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
7377 if (ret) 7710 if (ret)
7378 goto err; 7711 goto err;
7379 } else if (!skip_sum) { 7712 } else {
7380 ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio, 7713 ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
7381 file_offset); 7714 file_offset);
7382 if (ret) 7715 if (ret)
7383 goto err; 7716 goto err;
7384 } 7717 }
7385
7386map: 7718map:
7387 ret = btrfs_map_bio(root, rw, bio, 0, async_submit); 7719 ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
7388err: 7720err:
@@ -7403,7 +7735,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7403 u64 submit_len = 0; 7735 u64 submit_len = 0;
7404 u64 map_length; 7736 u64 map_length;
7405 int nr_pages = 0; 7737 int nr_pages = 0;
7406 int ret = 0; 7738 int ret;
7407 int async_submit = 0; 7739 int async_submit = 0;
7408 7740
7409 map_length = orig_bio->bi_iter.bi_size; 7741 map_length = orig_bio->bi_iter.bi_size;
@@ -7414,6 +7746,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7414 7746
7415 if (map_length >= orig_bio->bi_iter.bi_size) { 7747 if (map_length >= orig_bio->bi_iter.bi_size) {
7416 bio = orig_bio; 7748 bio = orig_bio;
7749 dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
7417 goto submit; 7750 goto submit;
7418 } 7751 }
7419 7752
@@ -7430,12 +7763,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7430 7763
7431 bio->bi_private = dip; 7764 bio->bi_private = dip;
7432 bio->bi_end_io = btrfs_end_dio_bio; 7765 bio->bi_end_io = btrfs_end_dio_bio;
7766 btrfs_io_bio(bio)->logical = file_offset;
7433 atomic_inc(&dip->pending_bios); 7767 atomic_inc(&dip->pending_bios);
7434 7768
7435 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 7769 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
7436 if (unlikely(map_length < submit_len + bvec->bv_len || 7770 if (map_length < submit_len + bvec->bv_len ||
7437 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 7771 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
7438 bvec->bv_offset) < bvec->bv_len)) { 7772 bvec->bv_offset) < bvec->bv_len) {
7439 /* 7773 /*
7440 * inc the count before we submit the bio so 7774 * inc the count before we submit the bio so
7441 * we know the end IO handler won't happen before 7775 * we know the end IO handler won't happen before
@@ -7464,6 +7798,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7464 goto out_err; 7798 goto out_err;
7465 bio->bi_private = dip; 7799 bio->bi_private = dip;
7466 bio->bi_end_io = btrfs_end_dio_bio; 7800 bio->bi_end_io = btrfs_end_dio_bio;
7801 btrfs_io_bio(bio)->logical = file_offset;
7467 7802
7468 map_length = orig_bio->bi_iter.bi_size; 7803 map_length = orig_bio->bi_iter.bi_size;
7469 ret = btrfs_map_block(root->fs_info, rw, 7804 ret = btrfs_map_block(root->fs_info, rw,
@@ -7507,11 +7842,10 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7507 struct btrfs_root *root = BTRFS_I(inode)->root; 7842 struct btrfs_root *root = BTRFS_I(inode)->root;
7508 struct btrfs_dio_private *dip; 7843 struct btrfs_dio_private *dip;
7509 struct bio *io_bio; 7844 struct bio *io_bio;
7845 struct btrfs_io_bio *btrfs_bio;
7510 int skip_sum; 7846 int skip_sum;
7511 int sum_len;
7512 int write = rw & REQ_WRITE; 7847 int write = rw & REQ_WRITE;
7513 int ret = 0; 7848 int ret = 0;
7514 u16 csum_size;
7515 7849
7516 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 7850 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
7517 7851
@@ -7521,16 +7855,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7521 goto free_ordered; 7855 goto free_ordered;
7522 } 7856 }
7523 7857
7524 if (!skip_sum && !write) { 7858 dip = kzalloc(sizeof(*dip), GFP_NOFS);
7525 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
7526 sum_len = dio_bio->bi_iter.bi_size >>
7527 inode->i_sb->s_blocksize_bits;
7528 sum_len *= csum_size;
7529 } else {
7530 sum_len = 0;
7531 }
7532
7533 dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
7534 if (!dip) { 7859 if (!dip) {
7535 ret = -ENOMEM; 7860 ret = -ENOMEM;
7536 goto free_io_bio; 7861 goto free_io_bio;
@@ -7542,20 +7867,25 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7542 dip->bytes = dio_bio->bi_iter.bi_size; 7867 dip->bytes = dio_bio->bi_iter.bi_size;
7543 dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; 7868 dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
7544 io_bio->bi_private = dip; 7869 io_bio->bi_private = dip;
7545 dip->errors = 0;
7546 dip->orig_bio = io_bio; 7870 dip->orig_bio = io_bio;
7547 dip->dio_bio = dio_bio; 7871 dip->dio_bio = dio_bio;
7548 atomic_set(&dip->pending_bios, 0); 7872 atomic_set(&dip->pending_bios, 0);
7873 btrfs_bio = btrfs_io_bio(io_bio);
7874 btrfs_bio->logical = file_offset;
7549 7875
7550 if (write) 7876 if (write) {
7551 io_bio->bi_end_io = btrfs_endio_direct_write; 7877 io_bio->bi_end_io = btrfs_endio_direct_write;
7552 else 7878 } else {
7553 io_bio->bi_end_io = btrfs_endio_direct_read; 7879 io_bio->bi_end_io = btrfs_endio_direct_read;
7880 dip->subio_endio = btrfs_subio_endio_read;
7881 }
7554 7882
7555 ret = btrfs_submit_direct_hook(rw, dip, skip_sum); 7883 ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
7556 if (!ret) 7884 if (!ret)
7557 return; 7885 return;
7558 7886
7887 if (btrfs_bio->end_io)
7888 btrfs_bio->end_io(btrfs_bio, ret);
7559free_io_bio: 7889free_io_bio:
7560 bio_put(io_bio); 7890 bio_put(io_bio);
7561 7891
@@ -7652,8 +7982,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7652 ret = btrfs_delalloc_reserve_space(inode, count); 7982 ret = btrfs_delalloc_reserve_space(inode, count);
7653 if (ret) 7983 if (ret)
7654 goto out; 7984 goto out;
7655 } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, 7985 } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
7656 &BTRFS_I(inode)->runtime_flags))) { 7986 &BTRFS_I(inode)->runtime_flags)) {
7657 inode_dio_done(inode); 7987 inode_dio_done(inode);
7658 flags = DIO_LOCKING | DIO_SKIP_HOLES; 7988 flags = DIO_LOCKING | DIO_SKIP_HOLES;
7659 wakeup = false; 7989 wakeup = false;
@@ -8173,6 +8503,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
8173 ei->last_sub_trans = 0; 8503 ei->last_sub_trans = 0;
8174 ei->logged_trans = 0; 8504 ei->logged_trans = 0;
8175 ei->delalloc_bytes = 0; 8505 ei->delalloc_bytes = 0;
8506 ei->defrag_bytes = 0;
8176 ei->disk_i_size = 0; 8507 ei->disk_i_size = 0;
8177 ei->flags = 0; 8508 ei->flags = 0;
8178 ei->csum_bytes = 0; 8509 ei->csum_bytes = 0;
@@ -8231,6 +8562,7 @@ void btrfs_destroy_inode(struct inode *inode)
8231 WARN_ON(BTRFS_I(inode)->reserved_extents); 8562 WARN_ON(BTRFS_I(inode)->reserved_extents);
8232 WARN_ON(BTRFS_I(inode)->delalloc_bytes); 8563 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
8233 WARN_ON(BTRFS_I(inode)->csum_bytes); 8564 WARN_ON(BTRFS_I(inode)->csum_bytes);
8565 WARN_ON(BTRFS_I(inode)->defrag_bytes);
8234 8566
8235 /* 8567 /*
8236 * This can happen where we create an inode, but somebody else also 8568 * This can happen where we create an inode, but somebody else also
@@ -8646,7 +8978,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
8646 spin_unlock(&root->delalloc_lock); 8978 spin_unlock(&root->delalloc_lock);
8647 8979
8648 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); 8980 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
8649 if (unlikely(!work)) { 8981 if (!work) {
8650 if (delay_iput) 8982 if (delay_iput)
8651 btrfs_add_delayed_iput(inode); 8983 btrfs_add_delayed_iput(inode);
8652 else 8984 else
@@ -8832,7 +9164,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8832 } 9164 }
8833 key.objectid = btrfs_ino(inode); 9165 key.objectid = btrfs_ino(inode);
8834 key.offset = 0; 9166 key.offset = 0;
8835 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 9167 key.type = BTRFS_EXTENT_DATA_KEY;
8836 datasize = btrfs_file_extent_calc_inline_size(name_len); 9168 datasize = btrfs_file_extent_calc_inline_size(name_len);
8837 err = btrfs_insert_empty_item(trans, root, path, &key, 9169 err = btrfs_insert_empty_item(trans, root, path, &key,
8838 datasize); 9170 datasize);