author		Linus Torvalds <torvalds@linux-foundation.org>	2017-07-05 19:41:23 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-07-05 19:41:23 -0400
commit		8c27cb3566762613a23c080e3db7d0501af9a787 (patch)
tree		32b2752e320b6cb3ecf289dd00b5145a6de947e6
parent		7114f51fcb979f167ab5f625ac74059dcb1afc28 (diff)
parent		848c23b78fafdcd3270b06a30737f8dbd70c347f (diff)
Merge branch 'for-4.13-part1' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "The core updates improve error handling (mostly related to bios), with
  the usual incremental work on the GFP_NOFS (mis)use removal,
  refactoring or cleanups. Except the two top patches, all have been in
  for-next for an extensive amount of time.

  User visible changes:
   - statx support
   - quota override tunable
   - improved compression thresholds
   - obsoleted mount option alloc_start

  Core updates:
   - bio-related updates:
     - faster bio cloning
     - no allocation failures
     - preallocated flush bios
   - more kvzalloc use, memalloc_nofs protections, GFP_NOFS updates
   - prep work for btree_inode removal
   - dir-item validation
   - qgroup fixes and updates
   - cleanups:
     - removed unused struct members, unused code, refactoring
     - argument refactoring (fs_info/root, caller -> callee sink)
     - SEARCH_TREE ioctl docs"

* 'for-4.13-part1' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (115 commits)
  btrfs: Remove false alert when fiemap range is smaller than on-disk extent
  btrfs: Don't clear SGID when inheriting ACLs
  btrfs: fix integer overflow in calc_reclaim_items_nr
  btrfs: scrub: fix target device intialization while setting up scrub context
  btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
  btrfs: qgroup: Introduce extent changeset for qgroup reserve functions
  btrfs: qgroup: Fix qgroup reserved space underflow caused by buffered write and quotas being enabled
  btrfs: qgroup: Return actually freed bytes for qgroup release or free data
  btrfs: qgroup: Cleanup btrfs_qgroup_prepare_account_extents function
  btrfs: qgroup: Add quick exit for non-fs extents
  Btrfs: rework delayed ref total_bytes_pinned accounting
  Btrfs: return old and new total ref mods when adding delayed refs
  Btrfs: always account pinned bytes when dropping a tree block ref
  Btrfs: update total_bytes_pinned when pinning down extents
  Btrfs: make BUG_ON() in add_pinned_bytes() an ASSERT()
  Btrfs: make add_pinned_bytes() take an s64 num_bytes instead of u64
  btrfs: fix validation of XATTR_ITEM dir items
  btrfs: Verify dir_item in iterate_object_props
  btrfs: Check name_len before in btrfs_del_root_ref
  btrfs: Check name_len before reading btrfs_get_name
  ...
-rw-r--r--  fs/btrfs/acl.c                      13
-rw-r--r--  fs/btrfs/backref.c                  10
-rw-r--r--  fs/btrfs/check-integrity.c          53
-rw-r--r--  fs/btrfs/compression.c              94
-rw-r--r--  fs/btrfs/compression.h              44
-rw-r--r--  fs/btrfs/ctree.c                    42
-rw-r--r--  fs/btrfs/ctree.h                    84
-rw-r--r--  fs/btrfs/delayed-ref.c              29
-rw-r--r--  fs/btrfs/delayed-ref.h               6
-rw-r--r--  fs/btrfs/dev-replace.c               4
-rw-r--r--  fs/btrfs/dir-item.c                 76
-rw-r--r--  fs/btrfs/disk-io.c                 194
-rw-r--r--  fs/btrfs/disk-io.h                  12
-rw-r--r--  fs/btrfs/export.c                    5
-rw-r--r--  fs/btrfs/extent-tree.c             493
-rw-r--r--  fs/btrfs/extent_io.c               245
-rw-r--r--  fs/btrfs/extent_io.h                86
-rw-r--r--  fs/btrfs/file-item.c                31
-rw-r--r--  fs/btrfs/file.c                     46
-rw-r--r--  fs/btrfs/free-space-tree.c          38
-rw-r--r--  fs/btrfs/inode-map.c                 4
-rw-r--r--  fs/btrfs/inode.c                   457
-rw-r--r--  fs/btrfs/ioctl.c                    18
-rw-r--r--  fs/btrfs/lzo.c                      33
-rw-r--r--  fs/btrfs/ordered-data.c             17
-rw-r--r--  fs/btrfs/ordered-data.h              4
-rw-r--r--  fs/btrfs/print-tree.c                7
-rw-r--r--  fs/btrfs/props.c                     7
-rw-r--r--  fs/btrfs/qgroup.c                  225
-rw-r--r--  fs/btrfs/qgroup.h                    9
-rw-r--r--  fs/btrfs/raid56.c                   16
-rw-r--r--  fs/btrfs/reada.c                     1
-rw-r--r--  fs/btrfs/relocation.c               17
-rw-r--r--  fs/btrfs/root-tree.c                 7
-rw-r--r--  fs/btrfs/scrub.c                   211
-rw-r--r--  fs/btrfs/send.c                    112
-rw-r--r--  fs/btrfs/super.c                    74
-rw-r--r--  fs/btrfs/sysfs.c                    41
-rw-r--r--  fs/btrfs/tests/extent-io-tests.c     2
-rw-r--r--  fs/btrfs/transaction.c              25
-rw-r--r--  fs/btrfs/tree-log.c                 44
-rw-r--r--  fs/btrfs/volumes.c                  74
-rw-r--r--  fs/btrfs/volumes.h                   7
-rw-r--r--  fs/btrfs/xattr.c                     2
-rw-r--r--  fs/btrfs/zlib.c                     20
-rw-r--r--  include/trace/events/btrfs.h        36
-rw-r--r--  include/uapi/linux/btrfs.h          63
47 files changed, 1723 insertions(+), 1415 deletions(-)
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 247b8dfaf6e5..8d8370ddb6b2 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -78,12 +78,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name = XATTR_NAME_POSIX_ACL_ACCESS;
-		if (acl) {
-			ret = posix_acl_update_mode(inode, &inode->i_mode, &acl);
-			if (ret)
-				return ret;
-		}
-		ret = 0;
 		break;
 	case ACL_TYPE_DEFAULT:
 		if (!S_ISDIR(inode->i_mode))
@@ -119,6 +113,13 @@ out:
 
 int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
+	int ret;
+
+	if (type == ACL_TYPE_ACCESS && acl) {
+		ret = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+		if (ret)
+			return ret;
+	}
 	return __btrfs_set_acl(NULL, inode, acl, type);
 }
 
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 24865da63d8f..f723c11bb763 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -16,7 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
 #include <linux/rbtree.h>
 #include "ctree.h"
 #include "disk-io.h"
@@ -2305,7 +2305,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
 	size_t alloc_bytes;
 
 	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
-	data = vmalloc(alloc_bytes);
+	data = kvmalloc(alloc_bytes, GFP_KERNEL);
 	if (!data)
 		return ERR_PTR(-ENOMEM);
 
@@ -2339,9 +2339,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
 	if (IS_ERR(fspath))
 		return (void *)fspath;
 
-	ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
+	ifp = kmalloc(sizeof(*ifp), GFP_KERNEL);
 	if (!ifp) {
-		vfree(fspath);
+		kvfree(fspath);
 		return ERR_PTR(-ENOMEM);
 	}
 
@@ -2356,6 +2356,6 @@ void free_ipath(struct inode_fs_paths *ipath)
 {
 	if (!ipath)
 		return;
-	vfree(ipath->fspath);
+	kvfree(ipath->fspath);
 	kfree(ipath);
 }
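
The backref.c hunks above are an instance of the kvmalloc conversion mentioned in the pull message: try the slab allocator first, fall back to vmalloc transparently, and release with kvfree() because the caller no longer knows which allocator satisfied the request. A minimal sketch of the pattern outside btrfs (the buffer struct and helper names here are illustrative, not kernel APIs):

#include <linux/mm.h>
#include <linux/slab.h>

/* illustrative container; real callers size this from untrusted input */
struct byte_buf {
	size_t len;
	char data[];
};

static struct byte_buf *byte_buf_alloc(size_t bytes)
{
	struct byte_buf *buf;

	/*
	 * kvmalloc() attempts kmalloc() first and falls back to vmalloc()
	 * for large or fragmented requests; vmalloc cannot honor GFP_NOFS,
	 * which is why these call sites also moved to GFP_KERNEL.
	 */
	buf = kvmalloc(sizeof(*buf) + bytes, GFP_KERNEL);
	if (!buf)
		return NULL;
	buf->len = bytes;
	return buf;
}

static void byte_buf_free(struct byte_buf *buf)
{
	/* kvfree() is safe for memory from either allocator (and for NULL) */
	kvfree(buf);
}
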
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 4ded1c3f92b8..11d37c94ce05 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -94,7 +94,7 @@
 #include <linux/mutex.h>
 #include <linux/genhd.h>
 #include <linux/blkdev.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
 #include <linux/string.h>
 #include "ctree.h"
 #include "disk-io.h"
@@ -1638,12 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 		struct bio *bio;
 		unsigned int j;
 
-		bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
-		if (!bio) {
-			pr_info("btrfsic: bio_alloc() for %u pages failed!\n",
-				num_pages - i);
-			return -1;
-		}
+		bio = btrfs_io_bio_alloc(num_pages - i);
 		bio->bi_bdev = block_ctx->dev->bdev;
 		bio->bi_iter.bi_sector = dev_bytenr >> 9;
 		bio_set_op_attrs(bio, REQ_OP_READ, 0);
@@ -1668,14 +1663,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 		dev_bytenr += (j - i) * PAGE_SIZE;
 		i = j;
 	}
-	for (i = 0; i < num_pages; i++) {
+	for (i = 0; i < num_pages; i++)
 		block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
-		if (!block_ctx->datav[i]) {
-			pr_info("btrfsic: kmap() failed (dev %s)!\n",
-				block_ctx->dev->name);
-			return -1;
-		}
-	}
 
 	return block_ctx->len;
 }
@@ -2822,44 +2811,47 @@ static void __btrfsic_submit_bio(struct bio *bio)
 	dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
 	if (NULL != dev_state &&
 	    (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
-		unsigned int i;
+		unsigned int i = 0;
 		u64 dev_bytenr;
 		u64 cur_bytenr;
-		struct bio_vec *bvec;
+		struct bio_vec bvec;
+		struct bvec_iter iter;
 		int bio_is_patched;
 		char **mapped_datav;
+		unsigned int segs = bio_segments(bio);
 
 		dev_bytenr = 512 * bio->bi_iter.bi_sector;
 		bio_is_patched = 0;
 		if (dev_state->state->print_mask &
 		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
 			pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
-			       bio_op(bio), bio->bi_opf, bio->bi_vcnt,
+			       bio_op(bio), bio->bi_opf, segs,
 			       (unsigned long long)bio->bi_iter.bi_sector,
 			       dev_bytenr, bio->bi_bdev);
 
-		mapped_datav = kmalloc_array(bio->bi_vcnt,
+		mapped_datav = kmalloc_array(segs,
 					     sizeof(*mapped_datav), GFP_NOFS);
 		if (!mapped_datav)
 			goto leave;
 		cur_bytenr = dev_bytenr;
 
-		bio_for_each_segment_all(bvec, bio, i) {
-			BUG_ON(bvec->bv_len != PAGE_SIZE);
-			mapped_datav[i] = kmap(bvec->bv_page);
+		bio_for_each_segment(bvec, bio, iter) {
+			BUG_ON(bvec.bv_len != PAGE_SIZE);
+			mapped_datav[i] = kmap(bvec.bv_page);
+			i++;
 
 			if (dev_state->state->print_mask &
 			    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
 				pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
-				       i, cur_bytenr, bvec->bv_len, bvec->bv_offset);
-			cur_bytenr += bvec->bv_len;
+				       i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
+			cur_bytenr += bvec.bv_len;
 		}
 		btrfsic_process_written_block(dev_state, dev_bytenr,
-					      mapped_datav, bio->bi_vcnt,
+					      mapped_datav, segs,
 					      bio, &bio_is_patched,
 					      NULL, bio->bi_opf);
-		bio_for_each_segment_all(bvec, bio, i)
-			kunmap(bvec->bv_page);
+		bio_for_each_segment(bvec, bio, iter)
+			kunmap(bvec.bv_page);
 		kfree(mapped_datav);
 	} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
 		if (dev_state->state->print_mask &
@@ -2923,13 +2915,10 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
 		       fs_info->sectorsize, PAGE_SIZE);
 		return -1;
 	}
-	state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+	state = kvzalloc(sizeof(*state), GFP_KERNEL);
 	if (!state) {
-		state = vzalloc(sizeof(*state));
-		if (!state) {
-			pr_info("btrfs check-integrity: vzalloc() failed!\n");
-			return -1;
-		}
+		pr_info("btrfs check-integrity: allocation failed!\n");
+		return -1;
 	}
 
 	if (!btrfsic_is_initialized) {
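
The bvec rewrite in __btrfsic_submit_bio() above swaps bio_for_each_segment_all() for bio_for_each_segment(): the _all variant walks the bio's own bi_io_vec array and is only valid for bios the caller assembled itself, while the iterator form also works on cloned bios whose vector may be shared. A rough sketch of the iterator pattern (inspect_page() is a hypothetical consumer, not a kernel function):

#include <linux/bio.h>
#include <linux/highmem.h>

static void inspect_page(char *data, unsigned int len, u64 bytenr);

static void inspect_write_bio(struct bio *bio)
{
	struct bio_vec bvec;	/* a value copy, not a pointer */
	struct bvec_iter iter;
	u64 bytenr = 512 * bio->bi_iter.bi_sector;

	/*
	 * bio_for_each_segment() advances a private bvec_iter, so it never
	 * relies on bi_vcnt and is safe where the bio may be a clone.
	 */
	bio_for_each_segment(bvec, bio, iter) {
		char *data = kmap(bvec.bv_page);

		inspect_page(data + bvec.bv_offset, bvec.bv_len, bytenr);
		kunmap(bvec.bv_page);
		bytenr += bvec.bv_len;
	}
}
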
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a2fad39f79ba..2c0b7b57fcd5 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,6 +32,7 @@
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
 #include <linux/slab.h>
+#include <linux/sched/mm.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -42,48 +43,7 @@
 #include "extent_io.h"
 #include "extent_map.h"
 
-struct compressed_bio {
-	/* number of bios pending for this compressed extent */
-	refcount_t pending_bios;
-
-	/* the pages with the compressed data on them */
-	struct page **compressed_pages;
-
-	/* inode that owns this data */
-	struct inode *inode;
-
-	/* starting offset in the inode for our pages */
-	u64 start;
-
-	/* number of bytes in the inode we're working on */
-	unsigned long len;
-
-	/* number of bytes on disk */
-	unsigned long compressed_len;
-
-	/* the compression algorithm for this bio */
-	int compress_type;
-
-	/* number of compressed pages in the array */
-	unsigned long nr_pages;
-
-	/* IO errors */
-	int errors;
-	int mirror_num;
-
-	/* for reads, this is the bio we are copying the data into */
-	struct bio *orig_bio;
-
-	/*
-	 * the start of a variable length array of checksums only
-	 * used by reads
-	 */
-	u32 sums;
-};
-
-static int btrfs_decompress_bio(int type, struct page **pages_in,
-				u64 disk_start, struct bio *orig_bio,
-				size_t srclen);
+static int btrfs_decompress_bio(struct compressed_bio *cb);
 
 static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
 				      unsigned long disk_size)
@@ -94,12 +54,6 @@ static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
 		(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size;
 }
 
-static struct bio *compressed_bio_alloc(struct block_device *bdev,
-					u64 first_byte, gfp_t gfp_flags)
-{
-	return btrfs_bio_alloc(bdev, first_byte >> 9, BIO_MAX_PAGES, gfp_flags);
-}
-
 static int check_compressed_csum(struct btrfs_inode *inode,
 				 struct compressed_bio *cb,
 				 u64 disk_start)
@@ -173,11 +127,8 @@ static void end_compressed_bio_read(struct bio *bio)
 	/* ok, we're the last bio for this extent, lets start
 	 * the decompression.
 	 */
-	ret = btrfs_decompress_bio(cb->compress_type,
-				   cb->compressed_pages,
-				   cb->start,
-				   cb->orig_bio,
-				   cb->compressed_len);
+	ret = btrfs_decompress_bio(cb);
+
 csum_failed:
 	if (ret)
 		cb->errors = 1;
@@ -355,11 +306,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
 
 	bdev = fs_info->fs_devices->latest_bdev;
 
-	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
-	if (!bio) {
-		kfree(cb);
-		return BLK_STS_RESOURCE;
-	}
+	bio = btrfs_bio_alloc(bdev, first_byte);
 	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 	bio->bi_private = cb;
 	bio->bi_end_io = end_compressed_bio_write;
@@ -406,8 +353,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
 
 			bio_put(bio);
 
-			bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
-			BUG_ON(!bio);
+			bio = btrfs_bio_alloc(bdev, first_byte);
 			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 			bio->bi_private = cb;
 			bio->bi_end_io = end_compressed_bio_write;
@@ -650,9 +596,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	/* include any pages we added in add_ra-bio_pages */
 	cb->len = bio->bi_iter.bi_size;
 
-	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
-	if (!comp_bio)
-		goto fail2;
+	comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte);
 	bio_set_op_attrs (comp_bio, REQ_OP_READ, 0);
 	comp_bio->bi_private = cb;
 	comp_bio->bi_end_io = end_compressed_bio_read;
@@ -703,9 +647,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 			bio_put(comp_bio);
 
-			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
-							GFP_NOFS);
-			BUG_ON(!comp_bio);
+			comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte);
 			bio_set_op_attrs(comp_bio, REQ_OP_READ, 0);
 			comp_bio->bi_private = cb;
 			comp_bio->bi_end_io = end_compressed_bio_read;
@@ -801,6 +743,7 @@ static struct list_head *find_workspace(int type)
 	struct list_head *workspace;
 	int cpus = num_online_cpus();
 	int idx = type - 1;
+	unsigned nofs_flag;
 
 	struct list_head *idle_ws	= &btrfs_comp_ws[idx].idle_ws;
 	spinlock_t *ws_lock		= &btrfs_comp_ws[idx].ws_lock;
@@ -830,7 +773,15 @@ again:
 	atomic_inc(total_ws);
 	spin_unlock(ws_lock);
 
+	/*
+	 * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have
+	 * to turn it off here because we might get called from the restricted
+	 * context of btrfs_compress_bio/btrfs_compress_pages
+	 */
+	nofs_flag = memalloc_nofs_save();
 	workspace = btrfs_compress_op[idx]->alloc_workspace();
+	memalloc_nofs_restore(nofs_flag);
+
 	if (IS_ERR(workspace)) {
 		atomic_dec(total_ws);
 		wake_up(ws_wait);
@@ -961,19 +912,16 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
  * be contiguous. They all correspond to the range of bytes covered by
  * the compressed extent.
  */
-static int btrfs_decompress_bio(int type, struct page **pages_in,
-				u64 disk_start, struct bio *orig_bio,
-				size_t srclen)
+static int btrfs_decompress_bio(struct compressed_bio *cb)
 {
 	struct list_head *workspace;
 	int ret;
+	int type = cb->compress_type;
 
 	workspace = find_workspace(type);
-
-	ret = btrfs_compress_op[type-1]->decompress_bio(workspace, pages_in,
-							disk_start, orig_bio,
-							srclen);
+	ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb);
 	free_workspace(type, workspace);
+
 	return ret;
 }
 
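
The find_workspace() change is the memalloc_nofs pattern referenced in the pull message ("memalloc_nofs protections"): rather than threading GFP_NOFS through every helper, including ones like vmalloc() that cannot honor it, the task is marked reclaim-restricted for a scope and allocations inside it use GFP_KERNEL. A minimal sketch of the idiom, assuming only the scoped-NOFS API introduced in 4.12:

#include <linux/sched/mm.h>
#include <linux/vmalloc.h>

/* illustrative helper: allocate scratch memory from a context that
 * must not recurse into filesystem reclaim */
static void *alloc_scratch_nofs(size_t bytes)
{
	unsigned int nofs_flag;
	void *p;

	/*
	 * memalloc_nofs_save() flags the current task so that every
	 * allocation below behaves as if GFP_NOFS were passed, even deep
	 * inside helpers that hardcode GFP_KERNEL.
	 */
	nofs_flag = memalloc_nofs_save();
	p = vmalloc(bytes);
	memalloc_nofs_restore(nofs_flag);

	return p;
}
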
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 680d4265d601..87f6d3332163 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -34,6 +34,45 @@
 /* Maximum size of data before compression */
 #define BTRFS_MAX_UNCOMPRESSED		(SZ_128K)
 
+struct compressed_bio {
+	/* number of bios pending for this compressed extent */
+	refcount_t pending_bios;
+
+	/* the pages with the compressed data on them */
+	struct page **compressed_pages;
+
+	/* inode that owns this data */
+	struct inode *inode;
+
+	/* starting offset in the inode for our pages */
+	u64 start;
+
+	/* number of bytes in the inode we're working on */
+	unsigned long len;
+
+	/* number of bytes on disk */
+	unsigned long compressed_len;
+
+	/* the compression algorithm for this bio */
+	int compress_type;
+
+	/* number of compressed pages in the array */
+	unsigned long nr_pages;
+
+	/* IO errors */
+	int errors;
+	int mirror_num;
+
+	/* for reads, this is the bio we are copying the data into */
+	struct bio *orig_bio;
+
+	/*
+	 * the start of a variable length array of checksums only
+	 * used by reads
+	 */
+	u32 sums;
+};
+
 void btrfs_init_compress(void);
 void btrfs_exit_compress(void);
 
@@ -78,10 +117,7 @@ struct btrfs_compress_op {
 			      unsigned long *total_out);
 
 	int (*decompress_bio)(struct list_head *workspace,
-			      struct page **pages_in,
-			      u64 disk_start,
-			      struct bio *orig_bio,
-			      size_t srclen);
+			      struct compressed_bio *cb);
 
 	int (*decompress)(struct list_head *workspace,
 			  unsigned char *data_in,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a3a75f1de002..3f4daa9d6e2c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -19,7 +19,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/rbtree.h>
-#include <linux/vmalloc.h>
+#include <linux/mm.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -3667,14 +3667,14 @@ static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
 	/* make room in the right data area */
 	data_end = leaf_data_end(fs_info, right);
 	memmove_extent_buffer(right,
-			      btrfs_leaf_data(right) + data_end - push_space,
-			      btrfs_leaf_data(right) + data_end,
+			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
+			      BTRFS_LEAF_DATA_OFFSET + data_end,
 			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
 
 	/* copy from the left data area */
-	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
+	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
 		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
-		     btrfs_leaf_data(left) + leaf_data_end(fs_info, left),
+		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
 		     push_space);
 
 	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
@@ -3888,9 +3888,9 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
 	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
 		     btrfs_item_offset_nr(right, push_items - 1);
 
-	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
+	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
 			   leaf_data_end(fs_info, left) - push_space,
-			   btrfs_leaf_data(right) +
+			   BTRFS_LEAF_DATA_OFFSET +
 			   btrfs_item_offset_nr(right, push_items - 1),
 			   push_space);
 	old_left_nritems = btrfs_header_nritems(left);
@@ -3917,9 +3917,9 @@ static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
 	if (push_items < right_nritems) {
 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
 			     leaf_data_end(fs_info, right);
-		memmove_extent_buffer(right, btrfs_leaf_data(right) +
+		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
 				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
-				      btrfs_leaf_data(right) +
+				      BTRFS_LEAF_DATA_OFFSET +
 				      leaf_data_end(fs_info, right), push_space);
 
 		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
@@ -4069,8 +4069,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
 		   nritems * sizeof(struct btrfs_item));
 
 	copy_extent_buffer(right, l,
-		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(fs_info) -
-		     data_copy_size, btrfs_leaf_data(l) +
+		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
+		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
 		     leaf_data_end(fs_info, l), data_copy_size);
 
 	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
@@ -4607,8 +4607,8 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
 
 	/* shift the data */
 	if (from_end) {
-		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
-			      data_end + size_diff, btrfs_leaf_data(leaf) +
+		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
 			      data_end, old_data_start + new_size - data_end);
 	} else {
 		struct btrfs_disk_key disk_key;
@@ -4634,8 +4634,8 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
 		}
 	}
 
-	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
-		      data_end + size_diff, btrfs_leaf_data(leaf) +
+	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+		      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
 		      data_end, old_data_start - data_end);
 
 	offset = btrfs_disk_key_offset(&disk_key);
@@ -4707,8 +4707,8 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 	}
 
 	/* shift the data */
-	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
-		      data_end - data_size, btrfs_leaf_data(leaf) +
+	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
 		      data_end, old_data - data_end);
 
 	data_end = old_data;
@@ -4790,8 +4790,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 			      (nritems - slot) * sizeof(struct btrfs_item));
 
 		/* shift the data */
-		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
-			      data_end - total_data, btrfs_leaf_data(leaf) +
+		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
+			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
 			      data_end, old_data - data_end);
 		data_end = old_data;
 	}
@@ -4983,9 +4983,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	if (slot + nr != nritems) {
 		int data_end = leaf_data_end(fs_info, leaf);
 
-		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
 			      data_end + dsize,
-			      btrfs_leaf_data(leaf) + data_end,
+			      BTRFS_LEAF_DATA_OFFSET + data_end,
 			      last_off - data_end);
 
 		for (i = slot + nr; i < nritems; i++) {
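
Every ctree.c hunk above is the same mechanical substitution: btrfs_leaf_data(leaf) ignored its argument and always returned offsetof(struct btrfs_leaf, items), so it becomes the compile-time constant BTRFS_LEAF_DATA_OFFSET (defined in the ctree.h diff below). The arithmetic it feeds is the usual two-ended leaf layout; a simplified user-space sketch with illustrative types:

#include <stddef.h>
#include <stdint.h>

/* simplified stand-ins for the on-disk structures */
struct item {
	uint32_t offset;	/* relative to the start of the data area */
	uint32_t size;
};

struct leaf {
	char header[101];	/* header size is illustrative */
	struct item items[];	/* item table grows toward the end... */
};				/* ...item data grows back from the end */

#define LEAF_DATA_OFFSET offsetof(struct leaf, items)

/* resolve an item's payload: constant header offset + per-item offset */
static inline void *item_ptr(struct leaf *l, int slot)
{
	return (char *)l + LEAF_DATA_OFFSET + l->items[slot].offset;
}
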
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a0d0c79d95ed..3f3eb7b17cac 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,7 +48,6 @@ struct btrfs_trans_handle;
 struct btrfs_transaction;
 struct btrfs_pending_snapshot;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
-extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 extern struct kmem_cache *btrfs_free_space_cachep;
@@ -716,6 +715,10 @@ struct btrfs_delayed_root;
 #define BTRFS_FS_BTREE_ERR			11
 #define BTRFS_FS_LOG1_ERR			12
 #define BTRFS_FS_LOG2_ERR			13
+#define BTRFS_FS_QUOTA_OVERRIDE			14
+/* Used to record internally whether fs has been frozen */
+#define BTRFS_FS_FROZEN				15
+
 /*
  * Indicate that a whole-filesystem exclusive operation is running
  * (device replace, resize, device add/delete, balance)
@@ -748,8 +751,7 @@ struct btrfs_fs_info {
 	struct rb_root block_group_cache_tree;
 
 	/* keep track of unallocated space */
-	spinlock_t free_chunk_lock;
-	u64 free_chunk_space;
+	atomic64_t free_chunk_space;
 
 	struct extent_io_tree freed_extents[2];
 	struct extent_io_tree *pinned_extents;
@@ -797,17 +799,7 @@ struct btrfs_fs_info {
 	 * so it is also safe.
 	 */
 	u64 max_inline;
-	/*
-	 * Protected by ->chunk_mutex and sb->s_umount.
-	 *
-	 * The reason that we use two lock to protect it is because only
-	 * remount and mount operations can change it and these two operations
-	 * are under sb->s_umount, but the read side (chunk allocation) can not
-	 * acquire sb->s_umount or the deadlock would happen. So we use two
-	 * locks to protect it. On the write side, we must acquire two locks,
-	 * and on the read side, we just need acquire one of them.
-	 */
-	u64 alloc_start;
+
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
 	wait_queue_head_t transaction_wait;
@@ -1107,9 +1099,6 @@ struct btrfs_fs_info {
 	 */
 	struct list_head pinned_chunks;
 
-	/* Used to record internally whether fs has been frozen */
-	int fs_frozen;
-
 	/* Cached block sizes */
 	u32 nodesize;
 	u32 sectorsize;
@@ -1277,21 +1266,20 @@ struct btrfs_root {
 	/* For qgroup metadata space reserve */
 	atomic64_t qgroup_meta_rsv;
 };
+
 static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
 {
 	return btrfs_sb(inode->i_sb)->sectorsize;
 }
 
-static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize)
-{
-	return blocksize - sizeof(struct btrfs_header);
-}
-
 static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
 {
-	return __BTRFS_LEAF_DATA_SIZE(info->nodesize);
+
+	return info->nodesize - sizeof(struct btrfs_header);
 }
 
+#define BTRFS_LEAF_DATA_OFFSET		offsetof(struct btrfs_leaf, items)
+
 static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info)
 {
 	return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item);
@@ -1553,8 +1541,27 @@ static inline void btrfs_set_##name(type *s, u##bits val)	\
 	s->member = cpu_to_le##bits(val);				\
 }
 
+
+static inline u64 btrfs_device_total_bytes(struct extent_buffer *eb,
+					   struct btrfs_dev_item *s)
+{
+	BUILD_BUG_ON(sizeof(u64) !=
+		     sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+	return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
+					    total_bytes));
+}
+static inline void btrfs_set_device_total_bytes(struct extent_buffer *eb,
+						struct btrfs_dev_item *s,
+						u64 val)
+{
+	BUILD_BUG_ON(sizeof(u64) !=
+		     sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+	WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
+	btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
+}
+
+
 BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
-BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
 BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
 BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
 BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
@@ -2324,10 +2331,6 @@ static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
 	return btrfs_csum_sizes[t];
 }
 
-static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
-{
-	return offsetof(struct btrfs_leaf, items);
-}
 
 /*
  * The leaf data grows from end-to-front in the node.
@@ -2538,11 +2541,11 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
 
 /* helper function to cast into the data area of the leaf. */
 #define btrfs_item_ptr(leaf, slot, type) \
-	((type *)(btrfs_leaf_data(leaf) + \
+	((type *)(BTRFS_LEAF_DATA_OFFSET + \
 	btrfs_item_offset_nr(leaf, slot)))
 
 #define btrfs_item_ptr_offset(leaf, slot) \
-	((unsigned long)(btrfs_leaf_data(leaf) + \
+	((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
 	btrfs_item_offset_nr(leaf, slot)))
 
 static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
@@ -2680,7 +2683,9 @@ void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 				       struct btrfs_fs_info *fs_info);
-u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
+u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info);
+u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info);
+u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 
 enum btrfs_reserve_flush_enum {
@@ -2703,9 +2708,13 @@ enum btrfs_flush_state {
 	COMMIT_TRANS		=	6,
 };
 
-int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
-void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
+int btrfs_check_data_free_space(struct inode *inode,
+			struct extent_changeset **reserved, u64 start, u64 len);
+void btrfs_free_reserved_data_space(struct inode *inode,
+			struct extent_changeset *reserved, u64 start, u64 len);
+void btrfs_delalloc_release_space(struct inode *inode,
+			struct extent_changeset *reserved, u64 start, u64 len);
 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 					    u64 len);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2722,8 +2731,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
 				      struct btrfs_block_rsv *rsv);
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
-void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
+int btrfs_delalloc_reserve_space(struct inode *inode,
+			struct extent_changeset **reserved, u64 start, u64 len);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
 					      unsigned short type);
@@ -3031,12 +3040,14 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 					  const char *name, u16 name_len,
 					  int mod);
 int verify_dir_item(struct btrfs_fs_info *fs_info,
-		    struct extent_buffer *leaf,
+		    struct extent_buffer *leaf, int slot,
 		    struct btrfs_dir_item *dir_item);
 struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
 			  struct btrfs_path *path,
 			  const char *name,
 			  int name_len);
+bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot,
+			     unsigned long start, u16 name_len);
 
 /* orphan.c */
 int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -3171,6 +3182,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio,
 			 unsigned long bio_flags);
+void btrfs_set_range_writeback(void *private_data, u64 start, u64 end);
 int btrfs_page_mkwrite(struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
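
Two things in the ctree.h diff deserve a note. BTRFS_FS_FROZEN moves the frozen state from a dedicated int into the fs_info flag bits, and free_chunk_space drops its spinlock for an atomic64_t, the usual simplification when a lock protects nothing but one 64-bit counter. A before/after sketch of the counter change (struct and function names here are illustrative):

#include <linux/atomic.h>
#include <linux/spinlock.h>

/* before: a lock guarding exactly one field */
struct space_locked {
	spinlock_t lock;
	u64 free;
};

static void consume_locked(struct space_locked *s, u64 bytes)
{
	spin_lock(&s->lock);
	s->free -= bytes;
	spin_unlock(&s->lock);
}

/* after: the same guarantee without a lock */
struct space_atomic {
	atomic64_t free;
};

static void consume_atomic(struct space_atomic *s, u64 bytes)
{
	atomic64_sub(bytes, &s->free);
}
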
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index be70d90dfee5..93ffa898df6d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -470,7 +470,8 @@ add_tail:
 static noinline void
 update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
 			 struct btrfs_delayed_ref_node *existing,
-			 struct btrfs_delayed_ref_node *update)
+			 struct btrfs_delayed_ref_node *update,
+			 int *old_ref_mod_ret)
 {
 	struct btrfs_delayed_ref_head *existing_ref;
 	struct btrfs_delayed_ref_head *ref;
@@ -523,6 +524,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
 	 * currently, for refs we just added we know we're a-ok.
 	 */
 	old_ref_mod = existing_ref->total_ref_mod;
+	if (old_ref_mod_ret)
+		*old_ref_mod_ret = old_ref_mod;
 	existing->ref_mod += update->ref_mod;
 	existing_ref->total_ref_mod += update->ref_mod;
 
@@ -550,7 +553,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		     struct btrfs_delayed_ref_node *ref,
 		     struct btrfs_qgroup_extent_record *qrecord,
 		     u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
-		     int action, int is_data, int *qrecord_inserted_ret)
+		     int action, int is_data, int *qrecord_inserted_ret,
+		     int *old_ref_mod, int *new_ref_mod)
 {
 	struct btrfs_delayed_ref_head *existing;
 	struct btrfs_delayed_ref_head *head_ref = NULL;
@@ -638,7 +642,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	if (existing) {
 		WARN_ON(ref_root && reserved && existing->qgroup_ref_root
 			&& existing->qgroup_reserved);
-		update_existing_head_ref(delayed_refs, &existing->node, ref);
+		update_existing_head_ref(delayed_refs, &existing->node, ref,
+					 old_ref_mod);
 		/*
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
@@ -646,6 +651,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
 		head_ref = existing;
 	} else {
+		if (old_ref_mod)
+			*old_ref_mod = 0;
 		if (is_data && count_mod < 0)
 			delayed_refs->pending_csums += num_bytes;
 		delayed_refs->num_heads++;
@@ -655,6 +662,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	}
 	if (qrecord_inserted_ret)
 		*qrecord_inserted_ret = qrecord_inserted;
+	if (new_ref_mod)
+		*new_ref_mod = head_ref->total_ref_mod;
 	return head_ref;
 }
 
@@ -778,7 +787,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root, int level, int action,
-			       struct btrfs_delayed_extent_op *extent_op)
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int *old_ref_mod, int *new_ref_mod)
 {
 	struct btrfs_delayed_tree_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
@@ -813,7 +823,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	 */
 	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
 					bytenr, num_bytes, 0, 0, action, 0,
-					&qrecord_inserted);
+					&qrecord_inserted, old_ref_mod,
+					new_ref_mod);
 
 	add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
 			     num_bytes, parent, ref_root, level, action);
@@ -838,7 +849,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes,
 			       u64 parent, u64 ref_root,
-			       u64 owner, u64 offset, u64 reserved, int action)
+			       u64 owner, u64 offset, u64 reserved, int action,
+			       int *old_ref_mod, int *new_ref_mod)
 {
 	struct btrfs_delayed_data_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
@@ -878,7 +890,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	 */
 	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
 					bytenr, num_bytes, ref_root, reserved,
-					action, 1, &qrecord_inserted);
+					action, 1, &qrecord_inserted,
+					old_ref_mod, new_ref_mod);
 
 	add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
 			     num_bytes, parent, ref_root, owner, offset,
@@ -909,7 +922,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 
 	add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
 			     num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
-			     extent_op->is_data, NULL);
+			     extent_op->is_data, NULL, NULL, NULL);
 
 	spin_unlock(&delayed_refs->lock);
 	return 0;
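
The old_ref_mod/new_ref_mod plumbing added above follows the common optional-out-parameter convention: callers that don't care pass NULL (as btrfs_add_delayed_extent_op() does with "NULL, NULL"), and the callee stores through a pointer only after checking it. Reduced to a sketch with hypothetical names:

struct counter {
	int value;
};

/*
 * Hypothetical helper in the style of add_delayed_ref_head(): the
 * pre-update value is reported only to callers that asked for it.
 */
static int counter_add(struct counter *c, int delta, int *old_ret)
{
	if (old_ret)
		*old_ret = c->value;
	c->value += delta;
	return c->value;
}

An uninterested caller simply writes counter_add(&c, 1, NULL).
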
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index c0264ff01b53..ce88e4ac5276 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -247,12 +247,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root, int level, int action,
-			       struct btrfs_delayed_extent_op *extent_op);
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int *old_ref_mod, int *new_ref_mod);
 int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes,
 			       u64 parent, u64 ref_root,
-			       u64 owner, u64 offset, u64 reserved, int action);
+			       u64 owner, u64 offset, u64 reserved, int action,
+			       int *old_ref_mod, int *new_ref_mod);
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 5fe1ca8abc70..bee3edeea7a3 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -388,7 +388,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 	if (ret)
 		btrfs_err(fs_info, "kobj add dev failed %d", ret);
 
-	btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
+	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 
 	/* force writing the updated state information to disk */
 	trans = btrfs_start_transaction(root, 0);
@@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 		return ret;
 	}
-	btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
+	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c24d615e3d7f..41cb9196eaa8 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -395,8 +395,6 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
 
 	leaf = path->nodes[0];
 	dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
-	if (verify_dir_item(fs_info, leaf, dir_item))
-		return NULL;
 
 	total_len = btrfs_item_size_nr(leaf, path->slots[0]);
 	while (cur < total_len) {
@@ -405,6 +403,8 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
 			btrfs_dir_data_len(leaf, dir_item);
 		name_ptr = (unsigned long)(dir_item + 1);
 
+		if (verify_dir_item(fs_info, leaf, path->slots[0], dir_item))
+			return NULL;
 		if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
 		    memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
 			return dir_item;
@@ -453,9 +453,11 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 
 int verify_dir_item(struct btrfs_fs_info *fs_info,
 		    struct extent_buffer *leaf,
+		    int slot,
 		    struct btrfs_dir_item *dir_item)
 {
 	u16 namelen = BTRFS_NAME_LEN;
+	int ret;
 	u8 type = btrfs_dir_type(leaf, dir_item);
 
 	if (type >= BTRFS_FT_MAX) {
@@ -472,6 +474,12 @@ int verify_dir_item(struct btrfs_fs_info *fs_info,
 		return 1;
 	}
 
+	namelen = btrfs_dir_name_len(leaf, dir_item);
+	ret = btrfs_is_name_len_valid(leaf, slot,
+				      (unsigned long)(dir_item + 1), namelen);
+	if (!ret)
+		return 1;
+
 	/* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
 	if ((btrfs_dir_data_len(leaf, dir_item) +
 	    btrfs_dir_name_len(leaf, dir_item)) >
@@ -484,3 +492,67 @@ int verify_dir_item(struct btrfs_fs_info *fs_info,
 
 	return 0;
 }
+
+bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot,
+			     unsigned long start, u16 name_len)
+{
+	struct btrfs_fs_info *fs_info = leaf->fs_info;
+	struct btrfs_key key;
+	u32 read_start;
+	u32 read_end;
+	u32 item_start;
+	u32 item_end;
+	u32 size;
+	bool ret = true;
+
+	ASSERT(start > BTRFS_LEAF_DATA_OFFSET);
+
+	read_start = start - BTRFS_LEAF_DATA_OFFSET;
+	read_end = read_start + name_len;
+	item_start = btrfs_item_offset_nr(leaf, slot);
+	item_end = btrfs_item_end_nr(leaf, slot);
+
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+
+	switch (key.type) {
+	case BTRFS_DIR_ITEM_KEY:
+	case BTRFS_XATTR_ITEM_KEY:
+	case BTRFS_DIR_INDEX_KEY:
+		size = sizeof(struct btrfs_dir_item);
+		break;
+	case BTRFS_INODE_REF_KEY:
+		size = sizeof(struct btrfs_inode_ref);
+		break;
+	case BTRFS_INODE_EXTREF_KEY:
+		size = sizeof(struct btrfs_inode_extref);
+		break;
+	case BTRFS_ROOT_REF_KEY:
+	case BTRFS_ROOT_BACKREF_KEY:
+		size = sizeof(struct btrfs_root_ref);
+		break;
+	default:
+		ret = false;
+		goto out;
+	}
+
+	if (read_start < item_start) {
+		ret = false;
+		goto out;
+	}
+	if (read_end > item_end) {
+		ret = false;
+		goto out;
+	}
+
+	/* there shall be item(s) before name */
+	if (read_start - item_start < size) {
+		ret = false;
+		goto out;
+	}
+
+out:
+	if (!ret)
+		btrfs_crit(fs_info, "invalid dir item name len: %u",
+			   (unsigned int)name_len);
+	return ret;
+}
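
btrfs_is_name_len_valid() above boils down to three interval checks against the leaf's data area: the name must start inside its item, end inside its item, and leave room for the fixed-size item header in front of it. The same logic as plain bounds arithmetic (a user-space sketch, with hdr_size standing in for the per-key-type sizeof):

#include <stdbool.h>
#include <stdint.h>

static bool name_len_valid(uint32_t read_start, uint16_t name_len,
			   uint32_t item_start, uint32_t item_end,
			   uint32_t hdr_size)
{
	uint32_t read_end = read_start + name_len;

	if (read_start < item_start)
		return false;	/* name begins before its item */
	if (read_end > item_end)
		return false;	/* name runs past the end of the item */
	if (read_start - item_start < hdr_size)
		return false;	/* no room left for the item header */
	return true;
}
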
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6036d15b47b8..5abcbdc743fa 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -89,7 +89,6 @@ struct btrfs_end_io_wq {
 	struct btrfs_fs_info *info;
 	blk_status_t status;
 	enum btrfs_wq_endio_type metadata;
-	struct list_head list;
 	struct btrfs_work work;
 };
 
@@ -118,9 +117,9 @@ void btrfs_end_io_wq_exit(void)
  * just before they are sent down the IO stack.
  */
 struct async_submit_bio {
-	struct inode *inode;
+	void *private_data;
+	struct btrfs_fs_info *fs_info;
 	struct bio *bio;
-	struct list_head list;
 	extent_submit_bio_hook_t *submit_bio_start;
 	extent_submit_bio_hook_t *submit_bio_done;
 	int mirror_num;
@@ -871,7 +870,7 @@ static void run_one_async_start(struct btrfs_work *work)
 	blk_status_t ret;
 
 	async = container_of(work, struct async_submit_bio, work);
-	ret = async->submit_bio_start(async->inode, async->bio,
+	ret = async->submit_bio_start(async->private_data, async->bio,
 				      async->mirror_num, async->bio_flags,
 				      async->bio_offset);
 	if (ret)
@@ -885,7 +884,7 @@ static void run_one_async_done(struct btrfs_work *work)
 	int limit;
 
 	async = container_of(work, struct async_submit_bio, work);
-	fs_info = BTRFS_I(async->inode)->root->fs_info;
+	fs_info = async->fs_info;
 
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
@@ -904,7 +903,7 @@ static void run_one_async_done(struct btrfs_work *work)
 		return;
 	}
 
-	async->submit_bio_done(async->inode, async->bio, async->mirror_num,
+	async->submit_bio_done(async->private_data, async->bio, async->mirror_num,
 			       async->bio_flags, async->bio_offset);
 }
 
@@ -916,11 +915,11 @@ static void run_one_async_free(struct btrfs_work *work)
 	kfree(async);
 }
 
-blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info,
-		struct inode *inode, struct bio *bio, int mirror_num,
-		unsigned long bio_flags, u64 bio_offset,
+blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+		int mirror_num, unsigned long bio_flags,
+		u64 bio_offset, void *private_data,
 		extent_submit_bio_hook_t *submit_bio_start,
 		extent_submit_bio_hook_t *submit_bio_done)
 {
 	struct async_submit_bio *async;
 
@@ -928,7 +927,8 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info,
 	if (!async)
 		return BLK_STS_RESOURCE;
 
-	async->inode = inode;
+	async->private_data = private_data;
+	async->fs_info = fs_info;
 	async->bio = bio;
 	async->mirror_num = mirror_num;
 	async->submit_bio_start = submit_bio_start;
@@ -974,9 +974,9 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
 	return errno_to_blk_status(ret);
 }
 
-static blk_status_t __btree_submit_bio_start(struct inode *inode,
-		struct bio *bio, int mirror_num, unsigned long bio_flags,
+static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio,
+		int mirror_num, unsigned long bio_flags,
 		u64 bio_offset)
 {
 	/*
 	 * when we're called for a write, we're already in the async
@@ -985,10 +985,11 @@ static blk_status_t __btree_submit_bio_start(struct inode *inode,
 	return btree_csum_one_bio(bio);
 }
 
-static blk_status_t __btree_submit_bio_done(struct inode *inode,
-		struct bio *bio, int mirror_num, unsigned long bio_flags,
+static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio,
+		int mirror_num, unsigned long bio_flags,
 		u64 bio_offset)
 {
+	struct inode *inode = private_data;
 	blk_status_t ret;
993 994
994 /* 995 /*
@@ -1014,10 +1015,11 @@ static int check_async_write(unsigned long bio_flags)
1014 return 1; 1015 return 1;
1015} 1016}
1016 1017
1017static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, 1018static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio,
1018 int mirror_num, unsigned long bio_flags, 1019 int mirror_num, unsigned long bio_flags,
1019 u64 bio_offset) 1020 u64 bio_offset)
1020{ 1021{
1022 struct inode *inode = private_data;
1021 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1023 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1022 int async = check_async_write(bio_flags); 1024 int async = check_async_write(bio_flags);
1023 blk_status_t ret; 1025 blk_status_t ret;
@@ -1042,8 +1044,8 @@ static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
1042 * kthread helpers are used to submit writes so that 1044 * kthread helpers are used to submit writes so that
1043 * checksumming can happen in parallel across all CPUs 1045 * checksumming can happen in parallel across all CPUs
1044 */ 1046 */
1045 ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, 0, 1047 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
1046 bio_offset, 1048 bio_offset, private_data,
1047 __btree_submit_bio_start, 1049 __btree_submit_bio_start,
1048 __btree_submit_bio_done); 1050 __btree_submit_bio_done);
1049 } 1051 }
@@ -1221,10 +1223,10 @@ int btrfs_write_tree_block(struct extent_buffer *buf)
1221 buf->start + buf->len - 1); 1223 buf->start + buf->len - 1);
1222} 1224}
1223 1225
1224int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) 1226void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
1225{ 1227{
1226 return filemap_fdatawait_range(buf->pages[0]->mapping, 1228 filemap_fdatawait_range(buf->pages[0]->mapping,
1227 buf->start, buf->start + buf->len - 1); 1229 buf->start, buf->start + buf->len - 1);
1228} 1230}
1229 1231
1230struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, 1232struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
@@ -1346,8 +1348,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
1346 root->log_transid_committed = -1; 1348 root->log_transid_committed = -1;
1347 root->last_log_commit = 0; 1349 root->last_log_commit = 0;
1348 if (!dummy) 1350 if (!dummy)
1349 extent_io_tree_init(&root->dirty_log_pages, 1351 extent_io_tree_init(&root->dirty_log_pages, NULL);
1350 fs_info->btree_inode->i_mapping);
1351 1352
1352 memset(&root->root_key, 0, sizeof(root->root_key)); 1353 memset(&root->root_key, 0, sizeof(root->root_key));
1353 memset(&root->root_item, 0, sizeof(root->root_item)); 1354 memset(&root->root_item, 0, sizeof(root->root_item));
@@ -2308,7 +2309,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
2308 inode->i_mapping->a_ops = &btree_aops; 2309 inode->i_mapping->a_ops = &btree_aops;
2309 2310
2310 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 2311 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
2311 extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping); 2312 extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode);
2312 BTRFS_I(inode)->io_tree.track_uptodate = 0; 2313 BTRFS_I(inode)->io_tree.track_uptodate = 0;
2313 extent_map_tree_init(&BTRFS_I(inode)->extent_tree); 2314 extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
2314 2315
@@ -2625,7 +2626,6 @@ int open_ctree(struct super_block *sb,
2625 spin_lock_init(&fs_info->fs_roots_radix_lock); 2626 spin_lock_init(&fs_info->fs_roots_radix_lock);
2626 spin_lock_init(&fs_info->delayed_iput_lock); 2627 spin_lock_init(&fs_info->delayed_iput_lock);
2627 spin_lock_init(&fs_info->defrag_inodes_lock); 2628 spin_lock_init(&fs_info->defrag_inodes_lock);
2628 spin_lock_init(&fs_info->free_chunk_lock);
2629 spin_lock_init(&fs_info->tree_mod_seq_lock); 2629 spin_lock_init(&fs_info->tree_mod_seq_lock);
2630 spin_lock_init(&fs_info->super_lock); 2630 spin_lock_init(&fs_info->super_lock);
2631 spin_lock_init(&fs_info->qgroup_op_lock); 2631 spin_lock_init(&fs_info->qgroup_op_lock);
@@ -2661,12 +2661,11 @@ int open_ctree(struct super_block *sb,
2661 atomic_set(&fs_info->qgroup_op_seq, 0); 2661 atomic_set(&fs_info->qgroup_op_seq, 0);
2662 atomic_set(&fs_info->reada_works_cnt, 0); 2662 atomic_set(&fs_info->reada_works_cnt, 0);
2663 atomic64_set(&fs_info->tree_mod_seq, 0); 2663 atomic64_set(&fs_info->tree_mod_seq, 0);
2664 fs_info->fs_frozen = 0;
2665 fs_info->sb = sb; 2664 fs_info->sb = sb;
2666 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; 2665 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
2667 fs_info->metadata_ratio = 0; 2666 fs_info->metadata_ratio = 0;
2668 fs_info->defrag_inodes = RB_ROOT; 2667 fs_info->defrag_inodes = RB_ROOT;
2669 fs_info->free_chunk_space = 0; 2668 atomic64_set(&fs_info->free_chunk_space, 0);
2670 fs_info->tree_mod_log = RB_ROOT; 2669 fs_info->tree_mod_log = RB_ROOT;
2671 fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; 2670 fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
2672 fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ 2671 fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
@@ -2703,10 +2702,8 @@ int open_ctree(struct super_block *sb,
2703 fs_info->block_group_cache_tree = RB_ROOT; 2702 fs_info->block_group_cache_tree = RB_ROOT;
2704 fs_info->first_logical_byte = (u64)-1; 2703 fs_info->first_logical_byte = (u64)-1;
2705 2704
2706 extent_io_tree_init(&fs_info->freed_extents[0], 2705 extent_io_tree_init(&fs_info->freed_extents[0], NULL);
2707 fs_info->btree_inode->i_mapping); 2706 extent_io_tree_init(&fs_info->freed_extents[1], NULL);
2708 extent_io_tree_init(&fs_info->freed_extents[1],
2709 fs_info->btree_inode->i_mapping);
2710 fs_info->pinned_extents = &fs_info->freed_extents[0]; 2707 fs_info->pinned_extents = &fs_info->freed_extents[0];
2711 set_bit(BTRFS_FS_BARRIER, &fs_info->flags); 2708 set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
2712 2709
@@ -3484,65 +3481,61 @@ static int write_dev_supers(struct btrfs_device *device,
3484 */ 3481 */
3485static void btrfs_end_empty_barrier(struct bio *bio) 3482static void btrfs_end_empty_barrier(struct bio *bio)
3486{ 3483{
3487 if (bio->bi_private) 3484 complete(bio->bi_private);
3488 complete(bio->bi_private);
3489 bio_put(bio);
3490} 3485}
3491 3486
3492/* 3487/*
3493 * trigger flushes for one the devices. If you pass wait == 0, the flushes are 3488 * Submit a flush request to the device if it supports it. Error handling is
3494 * sent down. With wait == 1, it waits for the previous flush. 3489 * done in the waiting counterpart.
3495 *
3496 * any device where the flush fails with eopnotsupp are flagged as not-barrier
3497 * capable
3498 */ 3490 */
3499static blk_status_t write_dev_flush(struct btrfs_device *device, int wait) 3491static void write_dev_flush(struct btrfs_device *device)
3500{ 3492{
3501 struct request_queue *q = bdev_get_queue(device->bdev); 3493 struct request_queue *q = bdev_get_queue(device->bdev);
3502 struct bio *bio; 3494 struct bio *bio = device->flush_bio;
3503 blk_status_t ret = 0;
3504 3495
3505 if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) 3496 if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
3506 return 0; 3497 return;
3507 3498
3508 if (wait) { 3499 bio_reset(bio);
3509 bio = device->flush_bio; 3500 bio->bi_end_io = btrfs_end_empty_barrier;
3510 if (!bio) 3501 bio->bi_bdev = device->bdev;
3511 return 0; 3502 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
3503 init_completion(&device->flush_wait);
3504 bio->bi_private = &device->flush_wait;
3512 3505
3513 wait_for_completion(&device->flush_wait); 3506 submit_bio(bio);
3507 device->flush_bio_sent = 1;
3508}
3514 3509
3515 if (bio->bi_status) { 3510/*
3516 ret = bio->bi_status; 3511 * If the flush bio has been submitted by write_dev_flush, wait for it.
3517 btrfs_dev_stat_inc_and_print(device, 3512 */
3518 BTRFS_DEV_STAT_FLUSH_ERRS); 3513static blk_status_t wait_dev_flush(struct btrfs_device *device)
3519 } 3514{
3515 struct bio *bio = device->flush_bio;
3520 3516
3521 /* drop the reference from the wait == 0 run */ 3517 if (!device->flush_bio_sent)
3522 bio_put(bio); 3518 return 0;
3523 device->flush_bio = NULL;
3524 3519
3525 return ret; 3520 device->flush_bio_sent = 0;
3526 } 3521 wait_for_completion_io(&device->flush_wait);
3527 3522
3528 /* 3523 return bio->bi_status;
3529 * one reference for us, and we leave it for the 3524}
3530 * caller
3531 */
3532 device->flush_bio = NULL;
3533 bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
3534 if (!bio)
3535 return BLK_STS_RESOURCE;
3536 3525
3537 bio->bi_end_io = btrfs_end_empty_barrier; 3526static int check_barrier_error(struct btrfs_fs_devices *fsdevs)
3538 bio->bi_bdev = device->bdev; 3527{
3539 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; 3528 int dev_flush_error = 0;
3540 init_completion(&device->flush_wait); 3529 struct btrfs_device *dev;
3541 bio->bi_private = &device->flush_wait;
3542 device->flush_bio = bio;
3543 3530
3544 bio_get(bio); 3531 list_for_each_entry_rcu(dev, &fsdevs->devices, dev_list) {
3545 btrfsic_submit_bio(bio); 3532 if (!dev->bdev || dev->last_flush_error)
3533 dev_flush_error++;
3534 }
3535
3536 if (dev_flush_error >
3537 fsdevs->fs_info->num_tolerated_disk_barrier_failures)
3538 return -EIO;
3546 3539
3547 return 0; 3540 return 0;
3548} 3541}
@@ -3555,7 +3548,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
3555{ 3548{
3556 struct list_head *head; 3549 struct list_head *head;
3557 struct btrfs_device *dev; 3550 struct btrfs_device *dev;
3558 int errors_send = 0;
3559 int errors_wait = 0; 3551 int errors_wait = 0;
3560 blk_status_t ret; 3552 blk_status_t ret;
3561 3553
@@ -3564,16 +3556,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
3564 list_for_each_entry_rcu(dev, head, dev_list) { 3556 list_for_each_entry_rcu(dev, head, dev_list) {
3565 if (dev->missing) 3557 if (dev->missing)
3566 continue; 3558 continue;
3567 if (!dev->bdev) { 3559 if (!dev->bdev)
3568 errors_send++;
3569 continue; 3560 continue;
3570 }
3571 if (!dev->in_fs_metadata || !dev->writeable) 3561 if (!dev->in_fs_metadata || !dev->writeable)
3572 continue; 3562 continue;
3573 3563
3574 ret = write_dev_flush(dev, 0); 3564 write_dev_flush(dev);
3575 if (ret) 3565 dev->last_flush_error = 0;
3576 errors_send++;
3577 } 3566 }
3578 3567
3579 /* wait for all the barriers */ 3568 /* wait for all the barriers */
@@ -3587,13 +3576,23 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
3587 if (!dev->in_fs_metadata || !dev->writeable) 3576 if (!dev->in_fs_metadata || !dev->writeable)
3588 continue; 3577 continue;
3589 3578
3590 ret = write_dev_flush(dev, 1); 3579 ret = wait_dev_flush(dev);
3591 if (ret) 3580 if (ret) {
3581 dev->last_flush_error = ret;
3582 btrfs_dev_stat_inc_and_print(dev,
3583 BTRFS_DEV_STAT_FLUSH_ERRS);
3592 errors_wait++; 3584 errors_wait++;
3585 }
3586 }
3587
3588 if (errors_wait) {
3589 /*
3590 * At some point we need the status of all disks
3591 * to arrive at the volume status. So error checking
3592 * is being pushed to a separate loop.
3593 */
3594 return check_barrier_error(info->fs_devices);
3593 } 3595 }
3594 if (errors_send > info->num_tolerated_disk_barrier_failures ||
3595 errors_wait > info->num_tolerated_disk_barrier_failures)
3596 return -EIO;
3597 return 0; 3596 return 0;
3598} 3597}
3599 3598
@@ -4577,11 +4576,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4577 4576
4578 cur_trans->state =TRANS_STATE_COMPLETED; 4577 cur_trans->state =TRANS_STATE_COMPLETED;
4579 wake_up(&cur_trans->commit_wait); 4578 wake_up(&cur_trans->commit_wait);
4580
4581 /*
4582 memset(cur_trans, 0, sizeof(*cur_trans));
4583 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
4584 */
4585} 4579}
4586 4580
4587static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) 4581static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
@@ -4637,6 +4631,12 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
4637 return 0; 4631 return 0;
4638} 4632}
4639 4633
4634static struct btrfs_fs_info *btree_fs_info(void *private_data)
4635{
4636 struct inode *inode = private_data;
4637 return btrfs_sb(inode->i_sb);
4638}
4639
4640static const struct extent_io_ops btree_extent_io_ops = { 4640static const struct extent_io_ops btree_extent_io_ops = {
4641 /* mandatory callbacks */ 4641 /* mandatory callbacks */
4642 .submit_bio_hook = btree_submit_bio_hook, 4642 .submit_bio_hook = btree_submit_bio_hook,
@@ -4644,6 +4644,8 @@ static const struct extent_io_ops btree_extent_io_ops = {
4644 /* note we're sharing with inode.c for the merge bio hook */ 4644 /* note we're sharing with inode.c for the merge bio hook */
4645 .merge_bio_hook = btrfs_merge_bio_hook, 4645 .merge_bio_hook = btrfs_merge_bio_hook,
4646 .readpage_io_failed_hook = btree_io_failed_hook, 4646 .readpage_io_failed_hook = btree_io_failed_hook,
4647 .set_range_writeback = btrfs_set_range_writeback,
4648 .tree_fs_info = btree_fs_info,
4647 4649
4648 /* optional callbacks */ 4650 /* optional callbacks */
4649}; 4651};
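
The flush rework above splits the old dual-purpose write_dev_flush(dev, wait) into a submit half and a wait half over a per-device preallocated flush_bio. Condensed into one hypothetical helper -- barrier_all_devices() above is the real caller, with the missing-device filtering, stat accounting, and the errors_wait short-circuit this sketch omits:

/*
 * Sketch only: submit the preallocated flush bio on every usable device,
 * then wait and record per-device status before folding everything into
 * a single volume-level answer via check_barrier_error().
 */
static int flush_all_devices(struct btrfs_fs_info *info)
{
	struct btrfs_device *dev;

	list_for_each_entry_rcu(dev, &info->fs_devices->devices, dev_list) {
		if (!dev->bdev || !dev->in_fs_metadata || !dev->writeable)
			continue;
		write_dev_flush(dev);		/* fire and remember */
		dev->last_flush_error = 0;
	}

	list_for_each_entry_rcu(dev, &info->fs_devices->devices, dev_list) {
		if (!dev->bdev || !dev->in_fs_metadata || !dev->writeable)
			continue;
		dev->last_flush_error = wait_dev_flush(dev);
	}

	return check_barrier_error(info->fs_devices);
}
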
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c581927555f3..0a634d3ffc16 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -120,14 +120,14 @@ u32 btrfs_csum_data(const char *data, u32 seed, size_t len);
120void btrfs_csum_final(u32 crc, u8 *result); 120void btrfs_csum_final(u32 crc, u8 *result);
121blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 121blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
122 enum btrfs_wq_endio_type metadata); 122 enum btrfs_wq_endio_type metadata);
123blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, 123blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
124 struct inode *inode, struct bio *bio, int mirror_num, 124 int mirror_num, unsigned long bio_flags,
125 unsigned long bio_flags, u64 bio_offset, 125 u64 bio_offset, void *private_data,
126 extent_submit_bio_hook_t *submit_bio_start, 126 extent_submit_bio_hook_t *submit_bio_start,
127 extent_submit_bio_hook_t *submit_bio_done); 127 extent_submit_bio_hook_t *submit_bio_done);
128unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); 128unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
129int btrfs_write_tree_block(struct extent_buffer *buf); 129int btrfs_write_tree_block(struct extent_buffer *buf);
130int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); 130void btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
131int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 131int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
132 struct btrfs_fs_info *fs_info); 132 struct btrfs_fs_info *fs_info);
133int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 133int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 87144c9f9593..fa66980726c9 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -282,6 +282,11 @@ static int btrfs_get_name(struct dentry *parent, char *name,
282 name_len = btrfs_inode_ref_name_len(leaf, iref); 282 name_len = btrfs_inode_ref_name_len(leaf, iref);
283 } 283 }
284 284
285 ret = btrfs_is_name_len_valid(leaf, path->slots[0], name_ptr, name_len);
286 if (!ret) {
287 btrfs_free_path(path);
288 return -EIO;
289 }
285 read_extent_buffer(leaf, name, name_ptr, name_len); 290 read_extent_buffer(leaf, name, name_ptr, name_len);
286 btrfs_free_path(path); 291 btrfs_free_path(path);
287 292
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 33d979e9ea2a..375f8c728d91 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -97,10 +97,11 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
97 u64 num_bytes, int delalloc); 97 u64 num_bytes, int delalloc);
98static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 98static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
99 u64 num_bytes); 99 u64 num_bytes);
100static int __reserve_metadata_bytes(struct btrfs_root *root, 100static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
101 struct btrfs_space_info *space_info, 101 struct btrfs_space_info *space_info,
102 u64 orig_bytes, 102 u64 orig_bytes,
103 enum btrfs_reserve_flush_enum flush); 103 enum btrfs_reserve_flush_enum flush,
104 bool system_chunk);
104static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 105static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
105 struct btrfs_space_info *space_info, 106 struct btrfs_space_info *space_info,
106 u64 num_bytes); 107 u64 num_bytes);
@@ -766,6 +767,26 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
766 return NULL; 767 return NULL;
767} 768}
768 769
770static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
771 u64 owner, u64 root_objectid)
772{
773 struct btrfs_space_info *space_info;
774 u64 flags;
775
776 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
777 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
778 flags = BTRFS_BLOCK_GROUP_SYSTEM;
779 else
780 flags = BTRFS_BLOCK_GROUP_METADATA;
781 } else {
782 flags = BTRFS_BLOCK_GROUP_DATA;
783 }
784
785 space_info = __find_space_info(fs_info, flags);
786 ASSERT(space_info);
787 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
788}
789
769/* 790/*
770 * after adding space to the filesystem, we need to clear the full flags 791 * after adding space to the filesystem, we need to clear the full flags
771 * on all the space infos. 792 * on all the space infos.
@@ -2092,6 +2113,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2092 u64 bytenr, u64 num_bytes, u64 parent, 2113 u64 bytenr, u64 num_bytes, u64 parent,
2093 u64 root_objectid, u64 owner, u64 offset) 2114 u64 root_objectid, u64 owner, u64 offset)
2094{ 2115{
2116 int old_ref_mod, new_ref_mod;
2095 int ret; 2117 int ret;
2096 2118
2097 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 2119 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
@@ -2099,15 +2121,21 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2099 2121
2100 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 2122 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2101 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 2123 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2102 num_bytes, 2124 num_bytes, parent,
2103 parent, root_objectid, (int)owner, 2125 root_objectid, (int)owner,
2104 BTRFS_ADD_DELAYED_REF, NULL); 2126 BTRFS_ADD_DELAYED_REF, NULL,
2127 &old_ref_mod, &new_ref_mod);
2105 } else { 2128 } else {
2106 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 2129 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
2107 num_bytes, parent, root_objectid, 2130 num_bytes, parent,
2108 owner, offset, 0, 2131 root_objectid, owner, offset,
2109 BTRFS_ADD_DELAYED_REF); 2132 0, BTRFS_ADD_DELAYED_REF,
2133 &old_ref_mod, &new_ref_mod);
2110 } 2134 }
2135
2136 if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
2137 add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid);
2138
2111 return ret; 2139 return ret;
2112} 2140}
2113 2141
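
btrfs_add_delayed_tree_ref() and btrfs_add_delayed_data_ref() now report the head's total ref mod before and after the update; a sign change across zero is exactly the point where total_bytes_pinned must move, as the hunk above does for the drop-cancelled case. The rule in isolation (account_ref_mod_change is illustrative, not a function added by the series):

/*
 * total_ref_mod < 0 means the head currently nets out to a drop, i.e.
 * the extent is headed for the pinned pool; >= 0 means it stays live.
 * Only a transition across zero changes the pinned accounting.
 */
static void account_ref_mod_change(struct btrfs_fs_info *fs_info,
				   u64 num_bytes, u64 owner,
				   u64 root_objectid,
				   int old_ref_mod, int new_ref_mod)
{
	if (old_ref_mod < 0 && new_ref_mod >= 0)
		/* a pending drop was cancelled by this add */
		add_pinned_bytes(fs_info, -(s64)num_bytes, owner,
				 root_objectid);
	else if (old_ref_mod >= 0 && new_ref_mod < 0)
		/* this update turned the head into a net drop */
		add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
}
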
@@ -2411,6 +2439,16 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2411 head = btrfs_delayed_node_to_head(node); 2439 head = btrfs_delayed_node_to_head(node);
2412 trace_run_delayed_ref_head(fs_info, node, head, node->action); 2440 trace_run_delayed_ref_head(fs_info, node, head, node->action);
2413 2441
2442 if (head->total_ref_mod < 0) {
2443 struct btrfs_block_group_cache *cache;
2444
2445 cache = btrfs_lookup_block_group(fs_info, node->bytenr);
2446 ASSERT(cache);
2447 percpu_counter_add(&cache->space_info->total_bytes_pinned,
2448 -node->num_bytes);
2449 btrfs_put_block_group(cache);
2450 }
2451
2414 if (insert_reserved) { 2452 if (insert_reserved) {
2415 btrfs_pin_extent(fs_info, node->bytenr, 2453 btrfs_pin_extent(fs_info, node->bytenr,
2416 node->num_bytes, 1); 2454 node->num_bytes, 1);
@@ -3364,6 +3402,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3364 struct btrfs_fs_info *fs_info = block_group->fs_info; 3402 struct btrfs_fs_info *fs_info = block_group->fs_info;
3365 struct btrfs_root *root = fs_info->tree_root; 3403 struct btrfs_root *root = fs_info->tree_root;
3366 struct inode *inode = NULL; 3404 struct inode *inode = NULL;
3405 struct extent_changeset *data_reserved = NULL;
3367 u64 alloc_hint = 0; 3406 u64 alloc_hint = 0;
3368 int dcs = BTRFS_DC_ERROR; 3407 int dcs = BTRFS_DC_ERROR;
3369 u64 num_pages = 0; 3408 u64 num_pages = 0;
@@ -3483,7 +3522,7 @@ again:
3483 num_pages *= 16; 3522 num_pages *= 16;
3484 num_pages *= PAGE_SIZE; 3523 num_pages *= PAGE_SIZE;
3485 3524
3486 ret = btrfs_check_data_free_space(inode, 0, num_pages); 3525 ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3487 if (ret) 3526 if (ret)
3488 goto out_put; 3527 goto out_put;
3489 3528
@@ -3514,6 +3553,7 @@ out:
3514 block_group->disk_cache_state = dcs; 3553 block_group->disk_cache_state = dcs;
3515 spin_unlock(&block_group->lock); 3554 spin_unlock(&block_group->lock);
3516 3555
3556 extent_changeset_free(data_reserved);
3517 return ret; 3557 return ret;
3518} 3558}
3519 3559
@@ -3924,88 +3964,83 @@ static const char *alloc_name(u64 flags)
3924 }; 3964 };
3925} 3965}
3926 3966
3927static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3967static int create_space_info(struct btrfs_fs_info *info, u64 flags,
3928 u64 total_bytes, u64 bytes_used, 3968 struct btrfs_space_info **new)
3929 u64 bytes_readonly,
3930 struct btrfs_space_info **space_info)
3931{ 3969{
3932 struct btrfs_space_info *found; 3970
3971 struct btrfs_space_info *space_info;
3933 int i; 3972 int i;
3934 int factor;
3935 int ret; 3973 int ret;
3936 3974
3937 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3975 space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
3938 BTRFS_BLOCK_GROUP_RAID10)) 3976 if (!space_info)
3939 factor = 2;
3940 else
3941 factor = 1;
3942
3943 found = __find_space_info(info, flags);
3944 if (found) {
3945 spin_lock(&found->lock);
3946 found->total_bytes += total_bytes;
3947 found->disk_total += total_bytes * factor;
3948 found->bytes_used += bytes_used;
3949 found->disk_used += bytes_used * factor;
3950 found->bytes_readonly += bytes_readonly;
3951 if (total_bytes > 0)
3952 found->full = 0;
3953 space_info_add_new_bytes(info, found, total_bytes -
3954 bytes_used - bytes_readonly);
3955 spin_unlock(&found->lock);
3956 *space_info = found;
3957 return 0;
3958 }
3959 found = kzalloc(sizeof(*found), GFP_NOFS);
3960 if (!found)
3961 return -ENOMEM; 3977 return -ENOMEM;
3962 3978
3963 ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); 3979 ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
3980 GFP_KERNEL);
3964 if (ret) { 3981 if (ret) {
3965 kfree(found); 3982 kfree(space_info);
3966 return ret; 3983 return ret;
3967 } 3984 }
3968 3985
3969 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3986 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3970 INIT_LIST_HEAD(&found->block_groups[i]); 3987 INIT_LIST_HEAD(&space_info->block_groups[i]);
3971 init_rwsem(&found->groups_sem); 3988 init_rwsem(&space_info->groups_sem);
3972 spin_lock_init(&found->lock); 3989 spin_lock_init(&space_info->lock);
3973 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3990 space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3974 found->total_bytes = total_bytes; 3991 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3975 found->disk_total = total_bytes * factor; 3992 init_waitqueue_head(&space_info->wait);
3976 found->bytes_used = bytes_used; 3993 INIT_LIST_HEAD(&space_info->ro_bgs);
3977 found->disk_used = bytes_used * factor; 3994 INIT_LIST_HEAD(&space_info->tickets);
3978 found->bytes_pinned = 0; 3995 INIT_LIST_HEAD(&space_info->priority_tickets);
3979 found->bytes_reserved = 0; 3996
3980 found->bytes_readonly = bytes_readonly; 3997 ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
3981 found->bytes_may_use = 0;
3982 found->full = 0;
3983 found->max_extent_size = 0;
3984 found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3985 found->chunk_alloc = 0;
3986 found->flush = 0;
3987 init_waitqueue_head(&found->wait);
3988 INIT_LIST_HEAD(&found->ro_bgs);
3989 INIT_LIST_HEAD(&found->tickets);
3990 INIT_LIST_HEAD(&found->priority_tickets);
3991
3992 ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3993 info->space_info_kobj, "%s", 3998 info->space_info_kobj, "%s",
3994 alloc_name(found->flags)); 3999 alloc_name(space_info->flags));
3995 if (ret) { 4000 if (ret) {
3996 percpu_counter_destroy(&found->total_bytes_pinned); 4001 percpu_counter_destroy(&space_info->total_bytes_pinned);
3997 kfree(found); 4002 kfree(space_info);
3998 return ret; 4003 return ret;
3999 } 4004 }
4000 4005
4001 *space_info = found; 4006 *new = space_info;
4002 list_add_rcu(&found->list, &info->space_info); 4007 list_add_rcu(&space_info->list, &info->space_info);
4003 if (flags & BTRFS_BLOCK_GROUP_DATA) 4008 if (flags & BTRFS_BLOCK_GROUP_DATA)
4004 info->data_sinfo = found; 4009 info->data_sinfo = space_info;
4005 4010
4006 return ret; 4011 return ret;
4007} 4012}
4008 4013
4014static void update_space_info(struct btrfs_fs_info *info, u64 flags,
4015 u64 total_bytes, u64 bytes_used,
4016 u64 bytes_readonly,
4017 struct btrfs_space_info **space_info)
4018{
4019 struct btrfs_space_info *found;
4020 int factor;
4021
4022 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
4023 BTRFS_BLOCK_GROUP_RAID10))
4024 factor = 2;
4025 else
4026 factor = 1;
4027
4028 found = __find_space_info(info, flags);
4029 ASSERT(found);
4030 spin_lock(&found->lock);
4031 found->total_bytes += total_bytes;
4032 found->disk_total += total_bytes * factor;
4033 found->bytes_used += bytes_used;
4034 found->disk_used += bytes_used * factor;
4035 found->bytes_readonly += bytes_readonly;
4036 if (total_bytes > 0)
4037 found->full = 0;
4038 space_info_add_new_bytes(info, found, total_bytes -
4039 bytes_used - bytes_readonly);
4040 spin_unlock(&found->lock);
4041 *space_info = found;
4042}
4043
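
With the split above, create_space_info() is the one-time constructor (percpu counter, kobject, lists) and update_space_info() is pure accounting that ASSERTs the info already exists. A hypothetical caller showing the intended ordering (add_block_group_space is not a function from the patch):

static int add_block_group_space(struct btrfs_fs_info *info, u64 flags,
				 u64 total_bytes, u64 bytes_used,
				 u64 bytes_readonly,
				 struct btrfs_space_info **space_info)
{
	int ret;

	/* First block group of this type: build the space_info itself. */
	if (!__find_space_info(info, flags)) {
		ret = create_space_info(info, flags, space_info);
		if (ret)
			return ret;
	}
	/* Accounting only; asserts the space_info now exists. */
	update_space_info(info, flags, total_bytes, bytes_used,
			  bytes_readonly, space_info);
	return 0;
}
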
4009static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 4044static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
4010{ 4045{
4011 u64 extra_flags = chunk_to_extended(flags) & 4046 u64 extra_flags = chunk_to_extended(flags) &
@@ -4121,7 +4156,7 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
4121 return btrfs_reduce_alloc_profile(fs_info, flags); 4156 return btrfs_reduce_alloc_profile(fs_info, flags);
4122} 4157}
4123 4158
4124u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 4159static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
4125{ 4160{
4126 struct btrfs_fs_info *fs_info = root->fs_info; 4161 struct btrfs_fs_info *fs_info = root->fs_info;
4127 u64 flags; 4162 u64 flags;
@@ -4138,6 +4173,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
4138 return ret; 4173 return ret;
4139} 4174}
4140 4175
4176u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
4177{
4178 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
4179}
4180
4181u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
4182{
4183 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4184}
4185
4186u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
4187{
4188 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4189}
4190
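
These three wrappers replace btrfs_get_alloc_profile(root, data), whose bare 0/1 argument obscured which block-group type was meant and forced callers to pick an arbitrary root. A representative before/after fragment from the conversions in this patch (check_system_chunk below):

/* before: a root plus a magic 0/1 selects the profile */
flags = btrfs_get_alloc_profile(fs_info->chunk_root, 0);

/* after: the block-group type is spelled out; only fs_info is needed */
flags = btrfs_system_alloc_profile(fs_info);
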
4141static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, 4191static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
4142 bool may_use_included) 4192 bool may_use_included)
4143{ 4193{
@@ -4187,7 +4237,7 @@ again:
4187 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 4237 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4188 spin_unlock(&data_sinfo->lock); 4238 spin_unlock(&data_sinfo->lock);
4189alloc: 4239alloc:
4190 alloc_target = btrfs_get_alloc_profile(root, 1); 4240 alloc_target = btrfs_data_alloc_profile(fs_info);
4191 /* 4241 /*
4192 * It is ugly that we don't call nolock join 4242 * It is ugly that we don't call nolock join
4193 * transaction for the free space inode case here. 4243 * transaction for the free space inode case here.
@@ -4238,7 +4288,7 @@ commit_trans:
4238 4288
4239 if (need_commit > 0) { 4289 if (need_commit > 0) {
4240 btrfs_start_delalloc_roots(fs_info, 0, -1); 4290 btrfs_start_delalloc_roots(fs_info, 0, -1);
4241 btrfs_wait_ordered_roots(fs_info, -1, 0, 4291 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4242 (u64)-1); 4292 (u64)-1);
4243 } 4293 }
4244 4294
@@ -4278,12 +4328,8 @@ commit_trans:
4278 return ret; 4328 return ret;
4279} 4329}
4280 4330
4281/* 4331int btrfs_check_data_free_space(struct inode *inode,
4282 * New check_data_free_space() with ability for precious data reservation 4332 struct extent_changeset **reserved, u64 start, u64 len)
4283 * Will replace old btrfs_check_data_free_space(), but for patch split,
4284 * add a new function first and then replace it.
4285 */
4286int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
4287{ 4333{
4288 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4334 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4289 int ret; 4335 int ret;
@@ -4298,9 +4344,11 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
4298 return ret; 4344 return ret;
4299 4345
4300 /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ 4346 /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4301 ret = btrfs_qgroup_reserve_data(inode, start, len); 4347 ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4302 if (ret) 4348 if (ret < 0)
4303 btrfs_free_reserved_data_space_noquota(inode, start, len); 4349 btrfs_free_reserved_data_space_noquota(inode, start, len);
4350 else
4351 ret = 0;
4304 return ret; 4352 return ret;
4305} 4353}
4306 4354
@@ -4341,7 +4389,8 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4341 * This one will handle the per-inode data rsv map for accurate reserved 4389 * This one will handle the per-inode data rsv map for accurate reserved
4342 * space framework. 4390 * space framework.
4343 */ 4391 */
4344void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) 4392void btrfs_free_reserved_data_space(struct inode *inode,
4393 struct extent_changeset *reserved, u64 start, u64 len)
4345{ 4394{
4346 struct btrfs_root *root = BTRFS_I(inode)->root; 4395 struct btrfs_root *root = BTRFS_I(inode)->root;
4347 4396
@@ -4351,7 +4400,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
4351 start = round_down(start, root->fs_info->sectorsize); 4400 start = round_down(start, root->fs_info->sectorsize);
4352 4401
4353 btrfs_free_reserved_data_space_noquota(inode, start, len); 4402 btrfs_free_reserved_data_space_noquota(inode, start, len);
4354 btrfs_qgroup_free_data(inode, start, len); 4403 btrfs_qgroup_free_data(inode, reserved, start, len);
4355} 4404}
4356 4405
4357static void force_metadata_allocation(struct btrfs_fs_info *info) 4406static void force_metadata_allocation(struct btrfs_fs_info *info)
@@ -4463,9 +4512,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
4463 } 4512 }
4464 4513
4465 if (left < thresh) { 4514 if (left < thresh) {
4466 u64 flags; 4515 u64 flags = btrfs_system_alloc_profile(fs_info);
4467 4516
4468 flags = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4469 /* 4517 /*
4470 * Ignore failure to create system chunk. We might end up not 4518 * Ignore failure to create system chunk. We might end up not
4471 * needing it, as we might not need to COW all nodes/leafs from 4519 * needing it, as we might not need to COW all nodes/leafs from
@@ -4506,10 +4554,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
4506 4554
4507 space_info = __find_space_info(fs_info, flags); 4555 space_info = __find_space_info(fs_info, flags);
4508 if (!space_info) { 4556 if (!space_info) {
4509 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 4557 ret = create_space_info(fs_info, flags, &space_info);
4510 BUG_ON(ret); /* -ENOMEM */ 4558 if (ret)
4559 return ret;
4511 } 4560 }
4512 BUG_ON(!space_info); /* Logic error */
4513 4561
4514again: 4562again:
4515 spin_lock(&space_info->lock); 4563 spin_lock(&space_info->lock);
@@ -4614,11 +4662,11 @@ out:
4614 return ret; 4662 return ret;
4615} 4663}
4616 4664
4617static int can_overcommit(struct btrfs_root *root, 4665static int can_overcommit(struct btrfs_fs_info *fs_info,
4618 struct btrfs_space_info *space_info, u64 bytes, 4666 struct btrfs_space_info *space_info, u64 bytes,
4619 enum btrfs_reserve_flush_enum flush) 4667 enum btrfs_reserve_flush_enum flush,
4668 bool system_chunk)
4620{ 4669{
4621 struct btrfs_fs_info *fs_info = root->fs_info;
4622 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4670 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4623 u64 profile; 4671 u64 profile;
4624 u64 space_size; 4672 u64 space_size;
@@ -4629,7 +4677,11 @@ static int can_overcommit(struct btrfs_root *root,
4629 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) 4677 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4630 return 0; 4678 return 0;
4631 4679
4632 profile = btrfs_get_alloc_profile(root, 0); 4680 if (system_chunk)
4681 profile = btrfs_system_alloc_profile(fs_info);
4682 else
4683 profile = btrfs_metadata_alloc_profile(fs_info);
4684
4633 used = btrfs_space_info_used(space_info, false); 4685 used = btrfs_space_info_used(space_info, false);
4634 4686
4635 /* 4687 /*
@@ -4646,9 +4698,7 @@ static int can_overcommit(struct btrfs_root *root,
4646 4698
4647 used += space_info->bytes_may_use; 4699 used += space_info->bytes_may_use;
4648 4700
4649 spin_lock(&fs_info->free_chunk_lock); 4701 avail = atomic64_read(&fs_info->free_chunk_space);
4650 avail = fs_info->free_chunk_space;
4651 spin_unlock(&fs_info->free_chunk_lock);
4652 4702
4653 /* 4703 /*
4654 * If we have dup, raid1 or raid10 then only half of the free 4704 * If we have dup, raid1 or raid10 then only half of the free
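
free_chunk_space becomes an atomic64_t here, so the read in can_overcommit() no longer needs free_chunk_lock (which open_ctree stops initializing earlier in this diff). The before/after shape of that hot read:

/* before: a dedicated spinlock guarded a plain u64 */
spin_lock(&fs_info->free_chunk_lock);
avail = fs_info->free_chunk_space;
spin_unlock(&fs_info->free_chunk_lock);

/* after: a lock-free read of the atomic64 counter */
avail = atomic64_read(&fs_info->free_chunk_space);
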
@@ -4698,14 +4748,14 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4698 } 4748 }
4699} 4749}
4700 4750
4701static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, 4751static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4702 u64 to_reclaim) 4752 u64 to_reclaim)
4703{ 4753{
4704 u64 bytes; 4754 u64 bytes;
4705 int nr; 4755 u64 nr;
4706 4756
4707 bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 4757 bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4708 nr = (int)div64_u64(to_reclaim, bytes); 4758 nr = div64_u64(to_reclaim, bytes);
4709 if (!nr) 4759 if (!nr)
4710 nr = 1; 4760 nr = 1;
4711 return nr; 4761 return nr;
@@ -4716,24 +4766,23 @@ static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4716/* 4766/*
4717 * shrink metadata reservation for delalloc 4767 * shrink metadata reservation for delalloc
4718 */ 4768 */
4719static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 4769static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4720 bool wait_ordered) 4770 u64 orig, bool wait_ordered)
4721{ 4771{
4722 struct btrfs_fs_info *fs_info = root->fs_info;
4723 struct btrfs_block_rsv *block_rsv; 4772 struct btrfs_block_rsv *block_rsv;
4724 struct btrfs_space_info *space_info; 4773 struct btrfs_space_info *space_info;
4725 struct btrfs_trans_handle *trans; 4774 struct btrfs_trans_handle *trans;
4726 u64 delalloc_bytes; 4775 u64 delalloc_bytes;
4727 u64 max_reclaim; 4776 u64 max_reclaim;
4777 u64 items;
4728 long time_left; 4778 long time_left;
4729 unsigned long nr_pages; 4779 unsigned long nr_pages;
4730 int loops; 4780 int loops;
4731 int items;
4732 enum btrfs_reserve_flush_enum flush; 4781 enum btrfs_reserve_flush_enum flush;
4733 4782
4734 /* Calc the number of the pages we need flush for space reservation */ 4783 /* Calc the number of the pages we need flush for space reservation */
4735 items = calc_reclaim_items_nr(fs_info, to_reclaim); 4784 items = calc_reclaim_items_nr(fs_info, to_reclaim);
4736 to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; 4785 to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4737 4786
4738 trans = (struct btrfs_trans_handle *)current->journal_info; 4787 trans = (struct btrfs_trans_handle *)current->journal_info;
4739 block_rsv = &fs_info->delalloc_block_rsv; 4788 block_rsv = &fs_info->delalloc_block_rsv;
@@ -4776,7 +4825,7 @@ skip_async:
4776 else 4825 else
4777 flush = BTRFS_RESERVE_NO_FLUSH; 4826 flush = BTRFS_RESERVE_NO_FLUSH;
4778 spin_lock(&space_info->lock); 4827 spin_lock(&space_info->lock);
4779 if (can_overcommit(root, space_info, orig, flush)) { 4828 if (can_overcommit(fs_info, space_info, orig, flush, false)) {
4780 spin_unlock(&space_info->lock); 4829 spin_unlock(&space_info->lock);
4781 break; 4830 break;
4782 } 4831 }
@@ -4838,7 +4887,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4838 4887
4839 spin_lock(&delayed_rsv->lock); 4888 spin_lock(&delayed_rsv->lock);
4840 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4889 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4841 bytes - delayed_rsv->size) >= 0) { 4890 bytes - delayed_rsv->size) < 0) {
4842 spin_unlock(&delayed_rsv->lock); 4891 spin_unlock(&delayed_rsv->lock);
4843 return -ENOSPC; 4892 return -ENOSPC;
4844 } 4893 }
@@ -4886,7 +4935,7 @@ static int flush_space(struct btrfs_fs_info *fs_info,
4886 break; 4935 break;
4887 case FLUSH_DELALLOC: 4936 case FLUSH_DELALLOC:
4888 case FLUSH_DELALLOC_WAIT: 4937 case FLUSH_DELALLOC_WAIT:
4889 shrink_delalloc(root, num_bytes * 2, orig_bytes, 4938 shrink_delalloc(fs_info, num_bytes * 2, orig_bytes,
4890 state == FLUSH_DELALLOC_WAIT); 4939 state == FLUSH_DELALLOC_WAIT);
4891 break; 4940 break;
4892 case ALLOC_CHUNK: 4941 case ALLOC_CHUNK:
@@ -4896,7 +4945,7 @@ static int flush_space(struct btrfs_fs_info *fs_info,
4896 break; 4945 break;
4897 } 4946 }
4898 ret = do_chunk_alloc(trans, fs_info, 4947 ret = do_chunk_alloc(trans, fs_info,
4899 btrfs_get_alloc_profile(root, 0), 4948 btrfs_metadata_alloc_profile(fs_info),
4900 CHUNK_ALLOC_NO_FORCE); 4949 CHUNK_ALLOC_NO_FORCE);
4901 btrfs_end_transaction(trans); 4950 btrfs_end_transaction(trans);
4902 if (ret > 0 || ret == -ENOSPC) 4951 if (ret > 0 || ret == -ENOSPC)
@@ -4917,8 +4966,9 @@ static int flush_space(struct btrfs_fs_info *fs_info,
4917} 4966}
4918 4967
4919static inline u64 4968static inline u64
4920btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, 4969btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
4921 struct btrfs_space_info *space_info) 4970 struct btrfs_space_info *space_info,
4971 bool system_chunk)
4922{ 4972{
4923 struct reserve_ticket *ticket; 4973 struct reserve_ticket *ticket;
4924 u64 used; 4974 u64 used;
@@ -4933,14 +4983,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4933 return to_reclaim; 4983 return to_reclaim;
4934 4984
4935 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); 4985 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4936 if (can_overcommit(root, space_info, to_reclaim, 4986 if (can_overcommit(fs_info, space_info, to_reclaim,
4937 BTRFS_RESERVE_FLUSH_ALL)) 4987 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4938 return 0; 4988 return 0;
4939 4989
4940 used = space_info->bytes_used + space_info->bytes_reserved + 4990 used = btrfs_space_info_used(space_info, true);
4941 space_info->bytes_pinned + space_info->bytes_readonly + 4991
4942 space_info->bytes_may_use; 4992 if (can_overcommit(fs_info, space_info, SZ_1M,
4943 if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) 4993 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4944 expected = div_factor_fine(space_info->total_bytes, 95); 4994 expected = div_factor_fine(space_info->total_bytes, 95);
4945 else 4995 else
4946 expected = div_factor_fine(space_info->total_bytes, 90); 4996 expected = div_factor_fine(space_info->total_bytes, 90);
@@ -4954,17 +5004,18 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4954 return to_reclaim; 5004 return to_reclaim;
4955} 5005}
4956 5006
4957static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, 5007static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
4958 struct btrfs_root *root, u64 used) 5008 struct btrfs_space_info *space_info,
5009 u64 used, bool system_chunk)
4959{ 5010{
4960 struct btrfs_fs_info *fs_info = root->fs_info;
4961 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 5011 u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4962 5012
4963 /* If we're just plain full then async reclaim just slows us down. */ 5013 /* If we're just plain full then async reclaim just slows us down. */
4964 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) 5014 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4965 return 0; 5015 return 0;
4966 5016
4967 if (!btrfs_calc_reclaim_metadata_size(root, space_info)) 5017 if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5018 system_chunk))
4968 return 0; 5019 return 0;
4969 5020
4970 return (used >= thresh && !btrfs_fs_closing(fs_info) && 5021 return (used >= thresh && !btrfs_fs_closing(fs_info) &&
@@ -5001,8 +5052,8 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
5001 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5052 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5002 5053
5003 spin_lock(&space_info->lock); 5054 spin_lock(&space_info->lock);
5004 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 5055 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5005 space_info); 5056 false);
5006 if (!to_reclaim) { 5057 if (!to_reclaim) {
5007 space_info->flush = 0; 5058 space_info->flush = 0;
5008 spin_unlock(&space_info->lock); 5059 spin_unlock(&space_info->lock);
@@ -5024,8 +5075,9 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
5024 spin_unlock(&space_info->lock); 5075 spin_unlock(&space_info->lock);
5025 return; 5076 return;
5026 } 5077 }
5027 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 5078 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
5028 space_info); 5079 space_info,
5080 false);
5029 ticket = list_first_entry(&space_info->tickets, 5081 ticket = list_first_entry(&space_info->tickets,
5030 struct reserve_ticket, list); 5082 struct reserve_ticket, list);
5031 if (last_tickets_id == space_info->tickets_id) { 5083 if (last_tickets_id == space_info->tickets_id) {
@@ -5063,8 +5115,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5063 int flush_state = FLUSH_DELAYED_ITEMS_NR; 5115 int flush_state = FLUSH_DELAYED_ITEMS_NR;
5064 5116
5065 spin_lock(&space_info->lock); 5117 spin_lock(&space_info->lock);
5066 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->extent_root, 5118 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5067 space_info); 5119 false);
5068 if (!to_reclaim) { 5120 if (!to_reclaim) {
5069 spin_unlock(&space_info->lock); 5121 spin_unlock(&space_info->lock);
5070 return; 5122 return;
@@ -5143,12 +5195,12 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5143 * regain reservations will be made and this will fail if there is not enough 5195 * regain reservations will be made and this will fail if there is not enough
5144 * space already. 5196 * space already.
5145 */ 5197 */
5146static int __reserve_metadata_bytes(struct btrfs_root *root, 5198static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5147 struct btrfs_space_info *space_info, 5199 struct btrfs_space_info *space_info,
5148 u64 orig_bytes, 5200 u64 orig_bytes,
5149 enum btrfs_reserve_flush_enum flush) 5201 enum btrfs_reserve_flush_enum flush,
5202 bool system_chunk)
5150{ 5203{
5151 struct btrfs_fs_info *fs_info = root->fs_info;
5152 struct reserve_ticket ticket; 5204 struct reserve_ticket ticket;
5153 u64 used; 5205 u64 used;
5154 int ret = 0; 5206 int ret = 0;
@@ -5170,7 +5222,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
5170 trace_btrfs_space_reservation(fs_info, "space_info", 5222 trace_btrfs_space_reservation(fs_info, "space_info",
5171 space_info->flags, orig_bytes, 1); 5223 space_info->flags, orig_bytes, 1);
5172 ret = 0; 5224 ret = 0;
5173 } else if (can_overcommit(root, space_info, orig_bytes, flush)) { 5225 } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
5226 system_chunk)) {
5174 space_info->bytes_may_use += orig_bytes; 5227 space_info->bytes_may_use += orig_bytes;
5175 trace_btrfs_space_reservation(fs_info, "space_info", 5228 trace_btrfs_space_reservation(fs_info, "space_info",
5176 space_info->flags, orig_bytes, 1); 5229 space_info->flags, orig_bytes, 1);
@@ -5197,7 +5250,7 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
5197 orig_bytes, flush, 5250 orig_bytes, flush,
5198 "enospc"); 5251 "enospc");
5199 queue_work(system_unbound_wq, 5252 queue_work(system_unbound_wq,
5200 &root->fs_info->async_reclaim_work); 5253 &fs_info->async_reclaim_work);
5201 } 5254 }
5202 } else { 5255 } else {
5203 list_add_tail(&ticket.list, 5256 list_add_tail(&ticket.list,
@@ -5211,7 +5264,8 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
5211 * the async reclaim as we will panic. 5264 * the async reclaim as we will panic.
5212 */ 5265 */
5213 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && 5266 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
5214 need_do_async_reclaim(space_info, root, used) && 5267 need_do_async_reclaim(fs_info, space_info,
5268 used, system_chunk) &&
5215 !work_busy(&fs_info->async_reclaim_work)) { 5269 !work_busy(&fs_info->async_reclaim_work)) {
5216 trace_btrfs_trigger_flush(fs_info, space_info->flags, 5270 trace_btrfs_trigger_flush(fs_info, space_info->flags,
5217 orig_bytes, flush, "preempt"); 5271 orig_bytes, flush, "preempt");
@@ -5269,9 +5323,10 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
5269 struct btrfs_fs_info *fs_info = root->fs_info; 5323 struct btrfs_fs_info *fs_info = root->fs_info;
5270 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5324 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5271 int ret; 5325 int ret;
5326 bool system_chunk = (root == fs_info->chunk_root);
5272 5327
5273 ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, 5328 ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5274 flush); 5329 orig_bytes, flush, system_chunk);
5275 if (ret == -ENOSPC && 5330 if (ret == -ENOSPC &&
5276 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 5331 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5277 if (block_rsv != global_rsv && 5332 if (block_rsv != global_rsv &&
@@ -5380,9 +5435,7 @@ static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5380 * overcommit, and if we can't then we just need to free up our space 5435 * overcommit, and if we can't then we just need to free up our space
5381 * and not satisfy any requests. 5436 * and not satisfy any requests.
5382 */ 5437 */
5383 used = space_info->bytes_used + space_info->bytes_reserved + 5438 used = btrfs_space_info_used(space_info, true);
5384 space_info->bytes_pinned + space_info->bytes_readonly +
5385 space_info->bytes_may_use;
5386 if (used - num_bytes >= space_info->total_bytes) 5439 if (used - num_bytes >= space_info->total_bytes)
5387 check_overcommit = true; 5440 check_overcommit = true;
5388again: 5441again:
@@ -5394,8 +5447,7 @@ again:
5394 * adding the ticket space would be a double count. 5447 * adding the ticket space would be a double count.
5395 */ 5448 */
5396 if (check_overcommit && 5449 if (check_overcommit &&
5397 !can_overcommit(fs_info->extent_root, space_info, 0, 5450 !can_overcommit(fs_info, space_info, 0, flush, false))
5398 flush))
5399 break; 5451 break;
5400 if (num_bytes >= ticket->bytes) { 5452 if (num_bytes >= ticket->bytes) {
5401 list_del_init(&ticket->list); 5453 list_del_init(&ticket->list);
@@ -6124,6 +6176,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
6124 * @inode: inode we're writing to 6176 * @inode: inode we're writing to
6125 * @start: start range we are writing to 6177 * @start: start range we are writing to
6126 * @len: how long the range we are writing to 6178 * @len: how long the range we are writing to
6179 * @reserved: mandatory parameter, record actually reserved qgroup ranges of
6180 * current reservation.
6127 * 6181 *
6128 * This will do the following things 6182 * This will do the following things
6129 * 6183 *
@@ -6141,16 +6195,17 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
6141 * Return 0 for success 6195 * Return 0 for success
6142 * Return <0 for error(-ENOSPC or -EQUOT) 6196 * Return <0 for error(-ENOSPC or -EQUOT)
6143 */ 6197 */
6144int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) 6198int btrfs_delalloc_reserve_space(struct inode *inode,
6199 struct extent_changeset **reserved, u64 start, u64 len)
6145{ 6200{
6146 int ret; 6201 int ret;
6147 6202
6148 ret = btrfs_check_data_free_space(inode, start, len); 6203 ret = btrfs_check_data_free_space(inode, reserved, start, len);
6149 if (ret < 0) 6204 if (ret < 0)
6150 return ret; 6205 return ret;
6151 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); 6206 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
6152 if (ret < 0) 6207 if (ret < 0)
6153 btrfs_free_reserved_data_space(inode, start, len); 6208 btrfs_free_reserved_data_space(inode, *reserved, start, len);
6154 return ret; 6209 return ret;
6155} 6210}
6156 6211
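
The extent_changeset threaded through the data-reservation API records which byte ranges the qgroup code actually reserved, so the matching free returns only those ranges even when reservations overlap. A hypothetical caller showing the full lifecycle (reserve_and_release is illustrative; the real callers are the buffered write, ioctl, and free-space-cache paths such as cache_save_setup above):

static int reserve_and_release(struct inode *inode, u64 start, u64 len)
{
	struct extent_changeset *data_reserved = NULL;
	int ret;

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;

	/* ... dirty the pages; on error, release the same ranges ... */
	btrfs_delalloc_release_space(inode, data_reserved, start, len);

	/* the changeset itself is freed separately, as in cache_save_setup */
	extent_changeset_free(data_reserved);
	return 0;
}
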
@@ -6169,10 +6224,11 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
6169 * list if there are no delalloc bytes left. 6224 * list if there are no delalloc bytes left.
6170 * Also it will handle the qgroup reserved space. 6225 * Also it will handle the qgroup reserved space.
6171 */ 6226 */
6172void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) 6227void btrfs_delalloc_release_space(struct inode *inode,
6228 struct extent_changeset *reserved, u64 start, u64 len)
6173{ 6229{
6174 btrfs_delalloc_release_metadata(BTRFS_I(inode), len); 6230 btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
6175 btrfs_free_reserved_data_space(inode, start, len); 6231 btrfs_free_reserved_data_space(inode, reserved, start, len);
6176} 6232}
6177 6233
6178static int update_block_group(struct btrfs_trans_handle *trans, 6234static int update_block_group(struct btrfs_trans_handle *trans,
@@ -6248,6 +6304,8 @@ static int update_block_group(struct btrfs_trans_handle *trans,
6248 trace_btrfs_space_reservation(info, "pinned", 6304 trace_btrfs_space_reservation(info, "pinned",
6249 cache->space_info->flags, 6305 cache->space_info->flags,
6250 num_bytes, 1); 6306 num_bytes, 1);
6307 percpu_counter_add(&cache->space_info->total_bytes_pinned,
6308 num_bytes);
6251 set_extent_dirty(info->pinned_extents, 6309 set_extent_dirty(info->pinned_extents,
6252 bytenr, bytenr + num_bytes - 1, 6310 bytenr, bytenr + num_bytes - 1,
6253 GFP_NOFS | __GFP_NOFAIL); 6311 GFP_NOFS | __GFP_NOFAIL);
@@ -6324,6 +6382,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info,
6324 6382
6325 trace_btrfs_space_reservation(fs_info, "pinned", 6383 trace_btrfs_space_reservation(fs_info, "pinned",
6326 cache->space_info->flags, num_bytes, 1); 6384 cache->space_info->flags, num_bytes, 1);
6385 percpu_counter_add(&cache->space_info->total_bytes_pinned, num_bytes);
6327 set_extent_dirty(fs_info->pinned_extents, bytenr, 6386 set_extent_dirty(fs_info->pinned_extents, bytenr,
6328 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 6387 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6329 return 0; 6388 return 0;
@@ -6794,27 +6853,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
6794 return 0; 6853 return 0;
6795} 6854}
6796 6855
6797static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
6798 u64 owner, u64 root_objectid)
6799{
6800 struct btrfs_space_info *space_info;
6801 u64 flags;
6802
6803 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6804 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
6805 flags = BTRFS_BLOCK_GROUP_SYSTEM;
6806 else
6807 flags = BTRFS_BLOCK_GROUP_METADATA;
6808 } else {
6809 flags = BTRFS_BLOCK_GROUP_DATA;
6810 }
6811
6812 space_info = __find_space_info(fs_info, flags);
6813 BUG_ON(!space_info); /* Logic bug */
6814 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
6815}
6816
6817
6818static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 6856static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6819 struct btrfs_fs_info *info, 6857 struct btrfs_fs_info *info,
6820 struct btrfs_delayed_ref_node *node, u64 parent, 6858 struct btrfs_delayed_ref_node *node, u64 parent,
@@ -7037,8 +7075,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
7037 goto out; 7075 goto out;
7038 } 7076 }
7039 } 7077 }
7040 add_pinned_bytes(info, -num_bytes, owner_objectid,
7041 root_objectid);
7042 } else { 7078 } else {
7043 if (found_extent) { 7079 if (found_extent) {
7044 BUG_ON(is_data && refs_to_drop != 7080 BUG_ON(is_data && refs_to_drop !=
@@ -7170,19 +7206,19 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7170 int ret; 7206 int ret;
7171 7207
7172 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7208 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7173 ret = btrfs_add_delayed_tree_ref(fs_info, trans, 7209 int old_ref_mod, new_ref_mod;
7174 buf->start, buf->len, 7210
7175 parent, 7211 ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start,
7212 buf->len, parent,
7176 root->root_key.objectid, 7213 root->root_key.objectid,
7177 btrfs_header_level(buf), 7214 btrfs_header_level(buf),
7178 BTRFS_DROP_DELAYED_REF, NULL); 7215 BTRFS_DROP_DELAYED_REF, NULL,
7216 &old_ref_mod, &new_ref_mod);
7179 BUG_ON(ret); /* -ENOMEM */ 7217 BUG_ON(ret); /* -ENOMEM */
7218 pin = old_ref_mod >= 0 && new_ref_mod < 0;
7180 } 7219 }
7181 7220
7182 if (!last_ref) 7221 if (last_ref && btrfs_header_generation(buf) == trans->transid) {
7183 return;
7184
7185 if (btrfs_header_generation(buf) == trans->transid) {
7186 struct btrfs_block_group_cache *cache; 7222 struct btrfs_block_group_cache *cache;
7187 7223
7188 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7224 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -7191,6 +7227,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7191 goto out; 7227 goto out;
7192 } 7228 }
7193 7229
7230 pin = 0;
7194 cache = btrfs_lookup_block_group(fs_info, buf->start); 7231 cache = btrfs_lookup_block_group(fs_info, buf->start);
7195 7232
7196 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 7233 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
@@ -7206,18 +7243,19 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7206 btrfs_free_reserved_bytes(cache, buf->len, 0); 7243 btrfs_free_reserved_bytes(cache, buf->len, 0);
7207 btrfs_put_block_group(cache); 7244 btrfs_put_block_group(cache);
7208 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); 7245 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
7209 pin = 0;
7210 } 7246 }
7211out: 7247out:
7212 if (pin) 7248 if (pin)
7213 add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf), 7249 add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf),
7214 root->root_key.objectid); 7250 root->root_key.objectid);
7215 7251
7216 /* 7252 if (last_ref) {
7217 * Deleting the buffer, clear the corrupt flag since it doesn't matter 7253 /*
7218 * anymore. 7254 * Deleting the buffer, clear the corrupt flag since it doesn't
7219 */ 7255 * matter anymore.
7220 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 7256 */
7257 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
7258 }
7221} 7259}
7222 7260
7223/* Can return -ENOMEM */ 7261/* Can return -ENOMEM */
@@ -7226,12 +7264,12 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
7226 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 7264 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7227 u64 owner, u64 offset) 7265 u64 owner, u64 offset)
7228{ 7266{
7267 int old_ref_mod, new_ref_mod;
7229 int ret; 7268 int ret;
7230 7269
7231 if (btrfs_is_testing(fs_info)) 7270 if (btrfs_is_testing(fs_info))
7232 return 0; 7271 return 0;
7233 7272
7234 add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
7235 7273
7236 /* 7274 /*
7237 * tree log blocks never actually go into the extent allocation 7275 * tree log blocks never actually go into the extent allocation
@@ -7241,19 +7279,25 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
7241 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 7279 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
7242 /* unlocks the pinned mutex */ 7280 /* unlocks the pinned mutex */
7243 btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); 7281 btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
7282 old_ref_mod = new_ref_mod = 0;
7244 ret = 0; 7283 ret = 0;
7245 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 7284 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7246 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 7285 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
7247 num_bytes, 7286 num_bytes, parent,
7248 parent, root_objectid, (int)owner, 7287 root_objectid, (int)owner,
7249 BTRFS_DROP_DELAYED_REF, NULL); 7288 BTRFS_DROP_DELAYED_REF, NULL,
7289 &old_ref_mod, &new_ref_mod);
7250 } else { 7290 } else {
7251 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 7291 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
7252 num_bytes, 7292 num_bytes, parent,
7253 parent, root_objectid, owner, 7293 root_objectid, owner, offset,
7254 offset, 0, 7294 0, BTRFS_DROP_DELAYED_REF,
7255 BTRFS_DROP_DELAYED_REF); 7295 &old_ref_mod, &new_ref_mod);
7256 } 7296 }
7297
7298 if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
7299 add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
7300
7257 return ret; 7301 return ret;
7258} 7302}
7259 7303
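The delayed-ref helpers now return the head's total ref mod before and after the update, and btrfs_free_extent() accounts pinned bytes only when that value crosses from non-negative to negative, i.e. when this drop is the one that actually schedules the extent for freeing. The sign convention as a standalone sketch (hypothetical helper, not the kernel function):

#include <assert.h>
#include <stdbool.h>

/* old/new are the delayed-ref head's total_ref_mod before and after
 * queuing this update; crossing zero downward means the extent's bytes
 * become pinned now, any other combination leaves the counter alone. */
static bool pins_bytes(int old_ref_mod, int new_ref_mod)
{
	return old_ref_mod >= 0 && new_ref_mod < 0;
}

int main(void)
{
	assert(pins_bytes(0, -1));	/* drop with no pending adds */
	assert(!pins_bytes(-1, -2));	/* already accounted once */
	assert(!pins_bytes(1, 0));	/* still referenced */
	return 0;
}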
@@ -7956,7 +8000,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
7956 u64 flags; 8000 u64 flags;
7957 int ret; 8001 int ret;
7958 8002
7959 flags = btrfs_get_alloc_profile(root, is_data); 8003 flags = get_alloc_profile_by_root(root, is_data);
7960again: 8004again:
7961 WARN_ON(num_bytes < fs_info->sectorsize); 8005 WARN_ON(num_bytes < fs_info->sectorsize);
7962 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, 8006 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
@@ -8200,9 +8244,9 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8200 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 8244 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
8201 8245
8202 ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, 8246 ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
8203 ins->offset, 0, 8247 ins->offset, 0, root_objectid, owner,
8204 root_objectid, owner, offset, 8248 offset, ram_bytes,
8205 ram_bytes, BTRFS_ADD_DELAYED_EXTENT); 8249 BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
8206 return ret; 8250 return ret;
8207} 8251}
8208 8252
@@ -8422,11 +8466,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8422 extent_op->is_data = false; 8466 extent_op->is_data = false;
8423 extent_op->level = level; 8467 extent_op->level = level;
8424 8468
8425 ret = btrfs_add_delayed_tree_ref(fs_info, trans, 8469 ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid,
8426 ins.objectid, ins.offset, 8470 ins.offset, parent,
8427 parent, root_objectid, level, 8471 root_objectid, level,
8428 BTRFS_ADD_DELAYED_EXTENT, 8472 BTRFS_ADD_DELAYED_EXTENT,
8429 extent_op); 8473 extent_op, NULL, NULL);
8430 if (ret) 8474 if (ret)
8431 goto out_free_delayed; 8475 goto out_free_delayed;
8432 } 8476 }
@@ -10059,19 +10103,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
10059 } 10103 }
10060 10104
10061 trace_btrfs_add_block_group(info, cache, 0); 10105 trace_btrfs_add_block_group(info, cache, 0);
10062 ret = update_space_info(info, cache->flags, found_key.offset, 10106 update_space_info(info, cache->flags, found_key.offset,
10063 btrfs_block_group_used(&cache->item), 10107 btrfs_block_group_used(&cache->item),
10064 cache->bytes_super, &space_info); 10108 cache->bytes_super, &space_info);
10065 if (ret) {
10066 btrfs_remove_free_space_cache(cache);
10067 spin_lock(&info->block_group_cache_lock);
10068 rb_erase(&cache->cache_node,
10069 &info->block_group_cache_tree);
10070 RB_CLEAR_NODE(&cache->cache_node);
10071 spin_unlock(&info->block_group_cache_lock);
10072 btrfs_put_block_group(cache);
10073 goto error;
10074 }
10075 10109
10076 cache->space_info = space_info; 10110 cache->space_info = space_info;
10077 10111
@@ -10203,16 +10237,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
10203 } 10237 }
10204#endif 10238#endif
10205 /* 10239 /*
10206 * Call to ensure the corresponding space_info object is created and 10240 * Ensure the corresponding space_info object is created and
10207 * assigned to our block group, but don't update its counters just yet. 10241 * assigned to our block group. We want our bg to be added to the rbtree
10208 * We want our bg to be added to the rbtree with its ->space_info set. 10242 * with its ->space_info set.
10209 */ 10243 */
10210 ret = update_space_info(fs_info, cache->flags, 0, 0, 0, 10244 cache->space_info = __find_space_info(fs_info, cache->flags);
10211 &cache->space_info); 10245 if (!cache->space_info) {
10212 if (ret) { 10246 ret = create_space_info(fs_info, cache->flags,
10213 btrfs_remove_free_space_cache(cache); 10247 &cache->space_info);
10214 btrfs_put_block_group(cache); 10248 if (ret) {
10215 return ret; 10249 btrfs_remove_free_space_cache(cache);
10250 btrfs_put_block_group(cache);
10251 return ret;
10252 }
10216 } 10253 }
10217 10254
10218 ret = btrfs_add_block_group_cache(fs_info, cache); 10255 ret = btrfs_add_block_group_cache(fs_info, cache);
@@ -10227,18 +10264,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
10227 * the rbtree, update the space info's counters. 10264 * the rbtree, update the space info's counters.
10228 */ 10265 */
10229 trace_btrfs_add_block_group(fs_info, cache, 1); 10266 trace_btrfs_add_block_group(fs_info, cache, 1);
10230 ret = update_space_info(fs_info, cache->flags, size, bytes_used, 10267 update_space_info(fs_info, cache->flags, size, bytes_used,
10231 cache->bytes_super, &cache->space_info); 10268 cache->bytes_super, &cache->space_info);
10232 if (ret) {
10233 btrfs_remove_free_space_cache(cache);
10234 spin_lock(&fs_info->block_group_cache_lock);
10235 rb_erase(&cache->cache_node,
10236 &fs_info->block_group_cache_tree);
10237 RB_CLEAR_NODE(&cache->cache_node);
10238 spin_unlock(&fs_info->block_group_cache_lock);
10239 btrfs_put_block_group(cache);
10240 return ret;
10241 }
10242 update_global_block_rsv(fs_info); 10269 update_global_block_rsv(fs_info);
10243 10270
10244 __link_block_group(cache->space_info, cache); 10271 __link_block_group(cache->space_info, cache);
@@ -10786,21 +10813,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10786 mixed = 1; 10813 mixed = 1;
10787 10814
10788 flags = BTRFS_BLOCK_GROUP_SYSTEM; 10815 flags = BTRFS_BLOCK_GROUP_SYSTEM;
10789 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10816 ret = create_space_info(fs_info, flags, &space_info);
10790 if (ret) 10817 if (ret)
10791 goto out; 10818 goto out;
10792 10819
10793 if (mixed) { 10820 if (mixed) {
10794 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 10821 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10795 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10822 ret = create_space_info(fs_info, flags, &space_info);
10796 } else { 10823 } else {
10797 flags = BTRFS_BLOCK_GROUP_METADATA; 10824 flags = BTRFS_BLOCK_GROUP_METADATA;
10798 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10825 ret = create_space_info(fs_info, flags, &space_info);
10799 if (ret) 10826 if (ret)
10800 goto out; 10827 goto out;
10801 10828
10802 flags = BTRFS_BLOCK_GROUP_DATA; 10829 flags = BTRFS_BLOCK_GROUP_DATA;
10803 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10830 ret = create_space_info(fs_info, flags, &space_info);
10804 } 10831 }
10805out: 10832out:
10806 return ret; 10833 return ret;
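With update_space_info() no longer doubling as a constructor, lookup and creation become separate steps: btrfs_make_block_group() reuses an existing space_info when one matches the flags and calls create_space_info() only on first use, while btrfs_init_space_info() creates the initial set outright. The find-or-create shape, as a self-contained sketch (types and the table are stand-ins for fs_info->space_info):

#include <stdlib.h>

struct space_info { unsigned long long flags; };

/* Stand-in lookup table; the kernel walks fs_info->space_info. */
static struct space_info *table[8];
static int table_len;

static struct space_info *find_space_info(unsigned long long flags)
{
	for (int i = 0; i < table_len; i++)
		if (table[i]->flags == flags)
			return table[i];
	return NULL;
}

static int create_space_info(unsigned long long flags, struct space_info **out)
{
	struct space_info *si;

	if (table_len == (int)(sizeof(table) / sizeof(table[0])))
		return -1;
	si = calloc(1, sizeof(*si));
	if (!si)
		return -1;	/* -ENOMEM in the kernel */
	si->flags = flags;
	table[table_len++] = si;
	*out = si;
	return 0;
}

/* Reuse if present, create only on first use of a flags combination. */
static struct space_info *get_space_info(unsigned long long flags)
{
	struct space_info *si = find_space_info(flags);

	if (si)
		return si;
	return create_space_info(flags, &si) ? NULL : si;
}

int main(void)
{
	struct space_info *a = get_space_info(1);	/* created */
	struct space_info *b = get_space_info(1);	/* reused  */

	return (a && a == b) ? 0 : 1;
}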
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d1cd60140817..7a18b5762ac9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -87,19 +87,9 @@ void btrfs_leak_debug_check(void)
87static inline void __btrfs_debug_check_extent_io_range(const char *caller, 87static inline void __btrfs_debug_check_extent_io_range(const char *caller,
88 struct extent_io_tree *tree, u64 start, u64 end) 88 struct extent_io_tree *tree, u64 start, u64 end)
89{ 89{
90 struct inode *inode; 90 if (tree->ops && tree->ops->check_extent_io_range)
91 u64 isize; 91 tree->ops->check_extent_io_range(tree->private_data, caller,
92 92 start, end);
93 if (!tree->mapping)
94 return;
95
96 inode = tree->mapping->host;
97 isize = i_size_read(inode);
98 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
99 btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
100 "%s: ino %llu isize %llu odd range [%llu,%llu]",
101 caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
102 }
103} 93}
104#else 94#else
105#define btrfs_leak_debug_add(new, head) do {} while (0) 95#define btrfs_leak_debug_add(new, head) do {} while (0)
@@ -154,9 +144,9 @@ static noinline void flush_write_bio(void *data);
154static inline struct btrfs_fs_info * 144static inline struct btrfs_fs_info *
155tree_fs_info(struct extent_io_tree *tree) 145tree_fs_info(struct extent_io_tree *tree)
156{ 146{
157 if (!tree->mapping) 147 if (tree->ops)
158 return NULL; 148 return tree->ops->tree_fs_info(tree->private_data);
159 return btrfs_sb(tree->mapping->host->i_sb); 149 return NULL;
160} 150}
161 151
162int __init extent_io_init(void) 152int __init extent_io_init(void)
@@ -214,13 +204,13 @@ void extent_io_exit(void)
214} 204}
215 205
216void extent_io_tree_init(struct extent_io_tree *tree, 206void extent_io_tree_init(struct extent_io_tree *tree,
217 struct address_space *mapping) 207 void *private_data)
218{ 208{
219 tree->state = RB_ROOT; 209 tree->state = RB_ROOT;
220 tree->ops = NULL; 210 tree->ops = NULL;
221 tree->dirty_bytes = 0; 211 tree->dirty_bytes = 0;
222 spin_lock_init(&tree->lock); 212 spin_lock_init(&tree->lock);
223 tree->mapping = mapping; 213 tree->private_data = private_data;
224} 214}
225 215
226static struct extent_state *alloc_extent_state(gfp_t mask) 216static struct extent_state *alloc_extent_state(gfp_t mask)
@@ -370,8 +360,7 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
370 struct extent_state *other) 360 struct extent_state *other)
371{ 361{
372 if (tree->ops && tree->ops->merge_extent_hook) 362 if (tree->ops && tree->ops->merge_extent_hook)
373 tree->ops->merge_extent_hook(tree->mapping->host, new, 363 tree->ops->merge_extent_hook(tree->private_data, new, other);
374 other);
375} 364}
376 365
377/* 366/*
@@ -422,15 +411,14 @@ static void set_state_cb(struct extent_io_tree *tree,
422 struct extent_state *state, unsigned *bits) 411 struct extent_state *state, unsigned *bits)
423{ 412{
424 if (tree->ops && tree->ops->set_bit_hook) 413 if (tree->ops && tree->ops->set_bit_hook)
425 tree->ops->set_bit_hook(tree->mapping->host, state, bits); 414 tree->ops->set_bit_hook(tree->private_data, state, bits);
426} 415}
427 416
428static void clear_state_cb(struct extent_io_tree *tree, 417static void clear_state_cb(struct extent_io_tree *tree,
429 struct extent_state *state, unsigned *bits) 418 struct extent_state *state, unsigned *bits)
430{ 419{
431 if (tree->ops && tree->ops->clear_bit_hook) 420 if (tree->ops && tree->ops->clear_bit_hook)
432 tree->ops->clear_bit_hook(BTRFS_I(tree->mapping->host), 421 tree->ops->clear_bit_hook(tree->private_data, state, bits);
433 state, bits);
434} 422}
435 423
436static void set_state_bits(struct extent_io_tree *tree, 424static void set_state_bits(struct extent_io_tree *tree,
@@ -479,7 +467,7 @@ static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
479 u64 split) 467 u64 split)
480{ 468{
481 if (tree->ops && tree->ops->split_extent_hook) 469 if (tree->ops && tree->ops->split_extent_hook)
482 tree->ops->split_extent_hook(tree->mapping->host, orig, split); 470 tree->ops->split_extent_hook(tree->private_data, orig, split);
483} 471}
484 472
485/* 473/*
@@ -1403,17 +1391,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1403 */ 1391 */
1404static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1392static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1405{ 1393{
1406 unsigned long index = start >> PAGE_SHIFT; 1394 tree->ops->set_range_writeback(tree->private_data, start, end);
1407 unsigned long end_index = end >> PAGE_SHIFT;
1408 struct page *page;
1409
1410 while (index <= end_index) {
1411 page = find_get_page(tree->mapping, index);
1412 BUG_ON(!page); /* Pages should be in the extent_io_tree */
1413 set_page_writeback(page);
1414 put_page(page);
1415 index++;
1416 }
1417} 1395}
1418 1396
1419/* find the first state struct with 'bits' set after 'start', and 1397/* find the first state struct with 'bits' set after 'start', and
@@ -1962,11 +1940,12 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1962 SetPageUptodate(page); 1940 SetPageUptodate(page);
1963} 1941}
1964 1942
1965int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec) 1943int free_io_failure(struct extent_io_tree *failure_tree,
1944 struct extent_io_tree *io_tree,
1945 struct io_failure_record *rec)
1966{ 1946{
1967 int ret; 1947 int ret;
1968 int err = 0; 1948 int err = 0;
1969 struct extent_io_tree *failure_tree = &inode->io_failure_tree;
1970 1949
1971 set_state_failrec(failure_tree, rec->start, NULL); 1950 set_state_failrec(failure_tree, rec->start, NULL);
1972 ret = clear_extent_bits(failure_tree, rec->start, 1951 ret = clear_extent_bits(failure_tree, rec->start,
@@ -1975,7 +1954,7 @@ int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec)
1975 if (ret) 1954 if (ret)
1976 err = ret; 1955 err = ret;
1977 1956
1978 ret = clear_extent_bits(&inode->io_tree, rec->start, 1957 ret = clear_extent_bits(io_tree, rec->start,
1979 rec->start + rec->len - 1, 1958 rec->start + rec->len - 1,
1980 EXTENT_DAMAGED); 1959 EXTENT_DAMAGED);
1981 if (ret && !err) 1960 if (ret && !err)
@@ -1995,11 +1974,10 @@ int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec)
1995 * currently, there can be no more than two copies of every data bit. thus, 1974 * currently, there can be no more than two copies of every data bit. thus,
1996 * exactly one rewrite is required. 1975 * exactly one rewrite is required.
1997 */ 1976 */
1998int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, 1977int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
1999 u64 logical, struct page *page, 1978 u64 length, u64 logical, struct page *page,
2000 unsigned int pg_offset, int mirror_num) 1979 unsigned int pg_offset, int mirror_num)
2001{ 1980{
2002 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2003 struct bio *bio; 1981 struct bio *bio;
2004 struct btrfs_device *dev; 1982 struct btrfs_device *dev;
2005 u64 map_length = 0; 1983 u64 map_length = 0;
@@ -2010,9 +1988,7 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
2010 ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); 1988 ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
2011 BUG_ON(!mirror_num); 1989 BUG_ON(!mirror_num);
2012 1990
2013 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 1991 bio = btrfs_io_bio_alloc(1);
2014 if (!bio)
2015 return -EIO;
2016 bio->bi_iter.bi_size = 0; 1992 bio->bi_iter.bi_size = 0;
2017 map_length = length; 1993 map_length = length;
2018 1994
@@ -2071,7 +2047,7 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
2071 2047
2072 btrfs_info_rl_in_rcu(fs_info, 2048 btrfs_info_rl_in_rcu(fs_info,
2073 "read error corrected: ino %llu off %llu (dev %s sector %llu)", 2049 "read error corrected: ino %llu off %llu (dev %s sector %llu)",
2074 btrfs_ino(inode), start, 2050 ino, start,
2075 rcu_str_deref(dev->name), sector); 2051 rcu_str_deref(dev->name), sector);
2076 btrfs_bio_counter_dec(fs_info); 2052 btrfs_bio_counter_dec(fs_info);
2077 bio_put(bio); 2053 bio_put(bio);
@@ -2091,8 +2067,7 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
2091 for (i = 0; i < num_pages; i++) { 2067 for (i = 0; i < num_pages; i++) {
2092 struct page *p = eb->pages[i]; 2068 struct page *p = eb->pages[i];
2093 2069
2094 ret = repair_io_failure(BTRFS_I(fs_info->btree_inode), start, 2070 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
2095 PAGE_SIZE, start, p,
2096 start - page_offset(p), mirror_num); 2071 start - page_offset(p), mirror_num);
2097 if (ret) 2072 if (ret)
2098 break; 2073 break;
@@ -2106,24 +2081,24 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
2106 * each time an IO finishes, we do a fast check in the IO failure tree 2081 * each time an IO finishes, we do a fast check in the IO failure tree
2107 * to see if we need to process or clean up an io_failure_record 2082 * to see if we need to process or clean up an io_failure_record
2108 */ 2083 */
2109int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page, 2084int clean_io_failure(struct btrfs_fs_info *fs_info,
2110 unsigned int pg_offset) 2085 struct extent_io_tree *failure_tree,
2086 struct extent_io_tree *io_tree, u64 start,
2087 struct page *page, u64 ino, unsigned int pg_offset)
2111{ 2088{
2112 u64 private; 2089 u64 private;
2113 struct io_failure_record *failrec; 2090 struct io_failure_record *failrec;
2114 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2115 struct extent_state *state; 2091 struct extent_state *state;
2116 int num_copies; 2092 int num_copies;
2117 int ret; 2093 int ret;
2118 2094
2119 private = 0; 2095 private = 0;
2120 ret = count_range_bits(&inode->io_failure_tree, &private, 2096 ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2121 (u64)-1, 1, EXTENT_DIRTY, 0); 2097 EXTENT_DIRTY, 0);
2122 if (!ret) 2098 if (!ret)
2123 return 0; 2099 return 0;
2124 2100
2125 ret = get_state_failrec(&inode->io_failure_tree, start, 2101 ret = get_state_failrec(failure_tree, start, &failrec);
2126 &failrec);
2127 if (ret) 2102 if (ret)
2128 return 0; 2103 return 0;
2129 2104
@@ -2139,25 +2114,25 @@ int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page,
2139 if (fs_info->sb->s_flags & MS_RDONLY) 2114 if (fs_info->sb->s_flags & MS_RDONLY)
2140 goto out; 2115 goto out;
2141 2116
2142 spin_lock(&inode->io_tree.lock); 2117 spin_lock(&io_tree->lock);
2143 state = find_first_extent_bit_state(&inode->io_tree, 2118 state = find_first_extent_bit_state(io_tree,
2144 failrec->start, 2119 failrec->start,
2145 EXTENT_LOCKED); 2120 EXTENT_LOCKED);
2146 spin_unlock(&inode->io_tree.lock); 2121 spin_unlock(&io_tree->lock);
2147 2122
2148 if (state && state->start <= failrec->start && 2123 if (state && state->start <= failrec->start &&
2149 state->end >= failrec->start + failrec->len - 1) { 2124 state->end >= failrec->start + failrec->len - 1) {
2150 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2125 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2151 failrec->len); 2126 failrec->len);
2152 if (num_copies > 1) { 2127 if (num_copies > 1) {
2153 repair_io_failure(inode, start, failrec->len, 2128 repair_io_failure(fs_info, ino, start, failrec->len,
2154 failrec->logical, page, 2129 failrec->logical, page, pg_offset,
2155 pg_offset, failrec->failed_mirror); 2130 failrec->failed_mirror);
2156 } 2131 }
2157 } 2132 }
2158 2133
2159out: 2134out:
2160 free_io_failure(inode, failrec); 2135 free_io_failure(failure_tree, io_tree, failrec);
2161 2136
2162 return 0; 2137 return 0;
2163} 2138}
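repair_io_failure(), clean_io_failure() and free_io_failure() now take fs_info, the two extent_io trees and the inode number as explicit parameters instead of deriving them from a btrfs_inode, so callers without a regular inode can use the same helpers; repair_eb_io_failure() above passes ino 0 for metadata. A sketch of the resulting calling convention (stand-in types, hypothetical names):

#include <stdio.h>

struct tree { const char *name; };

/* Every collaborator is a parameter; nothing is dug out of an inode. */
static void clean_io_failure_like(struct tree *failure_tree,
				  struct tree *io_tree,
				  unsigned long long ino,
				  unsigned long long start)
{
	printf("clean %s/%s ino=%llu start=%llu\n",
	       failure_tree->name, io_tree->name, ino, start);
}

int main(void)
{
	struct tree ft = { "failure" }, iot = { "io" };

	/* data inode path */
	clean_io_failure_like(&ft, &iot, 257, 0);
	/* metadata path: same helper, ino 0 as in repair_eb_io_failure() */
	clean_io_failure_like(&ft, &iot, 0, 4096);
	return 0;
}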
@@ -2357,10 +2332,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
2357 struct btrfs_io_bio *btrfs_failed_bio; 2332 struct btrfs_io_bio *btrfs_failed_bio;
2358 struct btrfs_io_bio *btrfs_bio; 2333 struct btrfs_io_bio *btrfs_bio;
2359 2334
2360 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2335 bio = btrfs_io_bio_alloc(1);
2361 if (!bio)
2362 return NULL;
2363
2364 bio->bi_end_io = endio_func; 2336 bio->bi_end_io = endio_func;
2365 bio->bi_iter.bi_sector = failrec->logical >> 9; 2337 bio->bi_iter.bi_sector = failrec->logical >> 9;
2366 bio->bi_bdev = fs_info->fs_devices->latest_bdev; 2338 bio->bi_bdev = fs_info->fs_devices->latest_bdev;
@@ -2398,6 +2370,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2398 struct io_failure_record *failrec; 2370 struct io_failure_record *failrec;
2399 struct inode *inode = page->mapping->host; 2371 struct inode *inode = page->mapping->host;
2400 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2372 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2373 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2401 struct bio *bio; 2374 struct bio *bio;
2402 int read_mode = 0; 2375 int read_mode = 0;
2403 blk_status_t status; 2376 blk_status_t status;
@@ -2411,7 +2384,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2411 2384
2412 ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror); 2385 ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
2413 if (!ret) { 2386 if (!ret) {
2414 free_io_failure(BTRFS_I(inode), failrec); 2387 free_io_failure(failure_tree, tree, failrec);
2415 return -EIO; 2388 return -EIO;
2416 } 2389 }
2417 2390
@@ -2424,7 +2397,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2424 (int)phy_offset, failed_bio->bi_end_io, 2397 (int)phy_offset, failed_bio->bi_end_io,
2425 NULL); 2398 NULL);
2426 if (!bio) { 2399 if (!bio) {
2427 free_io_failure(BTRFS_I(inode), failrec); 2400 free_io_failure(failure_tree, tree, failrec);
2428 return -EIO; 2401 return -EIO;
2429 } 2402 }
2430 bio_set_op_attrs(bio, REQ_OP_READ, read_mode); 2403 bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
@@ -2433,10 +2406,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2433 "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", 2406 "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
2434 read_mode, failrec->this_mirror, failrec->in_validation); 2407 read_mode, failrec->this_mirror, failrec->in_validation);
2435 2408
2436 status = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, 2409 status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
2437 failrec->bio_flags, 0); 2410 failrec->bio_flags, 0);
2438 if (status) { 2411 if (status) {
2439 free_io_failure(BTRFS_I(inode), failrec); 2412 free_io_failure(failure_tree, tree, failrec);
2440 bio_put(bio); 2413 bio_put(bio);
2441 ret = blk_status_to_errno(status); 2414 ret = blk_status_to_errno(status);
2442 } 2415 }
@@ -2542,7 +2515,7 @@ static void end_bio_extent_readpage(struct bio *bio)
2542 struct bio_vec *bvec; 2515 struct bio_vec *bvec;
2543 int uptodate = !bio->bi_status; 2516 int uptodate = !bio->bi_status;
2544 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2517 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2545 struct extent_io_tree *tree; 2518 struct extent_io_tree *tree, *failure_tree;
2546 u64 offset = 0; 2519 u64 offset = 0;
2547 u64 start; 2520 u64 start;
2548 u64 end; 2521 u64 end;
@@ -2563,6 +2536,7 @@ static void end_bio_extent_readpage(struct bio *bio)
2563 (u64)bio->bi_iter.bi_sector, bio->bi_status, 2536 (u64)bio->bi_iter.bi_sector, bio->bi_status,
2564 io_bio->mirror_num); 2537 io_bio->mirror_num);
2565 tree = &BTRFS_I(inode)->io_tree; 2538 tree = &BTRFS_I(inode)->io_tree;
2539 failure_tree = &BTRFS_I(inode)->io_failure_tree;
2566 2540
2567 /* We always issue full-page reads, but if some block 2541 /* We always issue full-page reads, but if some block
2568 * in a page fails to read, blk_update_request() will 2542 * in a page fails to read, blk_update_request() will
@@ -2592,8 +2566,10 @@ static void end_bio_extent_readpage(struct bio *bio)
2592 if (ret) 2566 if (ret)
2593 uptodate = 0; 2567 uptodate = 0;
2594 else 2568 else
2595 clean_io_failure(BTRFS_I(inode), start, 2569 clean_io_failure(BTRFS_I(inode)->root->fs_info,
2596 page, 0); 2570 failure_tree, tree, start,
2571 page,
2572 btrfs_ino(BTRFS_I(inode)), 0);
2597 } 2573 }
2598 2574
2599 if (likely(uptodate)) 2575 if (likely(uptodate))
@@ -2682,67 +2658,70 @@ readpage_ok:
2682} 2658}
2683 2659
2684/* 2660/*
2685 * this allocates from the btrfs_bioset. We're returning a bio right now 2661 * Initialize the members up to but not including 'bio'. Use after allocating a
2686 * but you can call btrfs_io_bio for the appropriate container_of magic 2662 * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
2663 * 'bio' because use of __GFP_ZERO is not supported.
2687 */ 2664 */
2688struct bio * 2665static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
2689btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2690 gfp_t gfp_flags)
2691{ 2666{
2692 struct btrfs_io_bio *btrfs_bio; 2667 memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
2693 struct bio *bio; 2668}
2694
2695 bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
2696 2669
2697 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 2670/*
2698 while (!bio && (nr_vecs /= 2)) { 2671 * The following helpers allocate a bio. As it's backed by a bioset, it'll
2699 bio = bio_alloc_bioset(gfp_flags, 2672 * never fail. We're returning a bio right now but you can call btrfs_io_bio
2700 nr_vecs, btrfs_bioset); 2673 * for the appropriate container_of magic
2701 } 2674 */
2702 } 2675struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte)
2676{
2677 struct bio *bio;
2703 2678
2704 if (bio) { 2679 bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset);
2705 bio->bi_bdev = bdev; 2680 bio->bi_bdev = bdev;
2706 bio->bi_iter.bi_sector = first_sector; 2681 bio->bi_iter.bi_sector = first_byte >> 9;
2707 btrfs_bio = btrfs_io_bio(bio); 2682 btrfs_io_bio_init(btrfs_io_bio(bio));
2708 btrfs_bio->csum = NULL;
2709 btrfs_bio->csum_allocated = NULL;
2710 btrfs_bio->end_io = NULL;
2711 }
2712 return bio; 2683 return bio;
2713} 2684}
2714 2685
2715struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) 2686struct bio *btrfs_bio_clone(struct bio *bio)
2716{ 2687{
2717 struct btrfs_io_bio *btrfs_bio; 2688 struct btrfs_io_bio *btrfs_bio;
2718 struct bio *new; 2689 struct bio *new;
2719 2690
2720 new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset); 2691 /* Bio allocation backed by a bioset does not fail */
2721 if (new) { 2692 new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset);
2722 btrfs_bio = btrfs_io_bio(new); 2693 btrfs_bio = btrfs_io_bio(new);
2723 btrfs_bio->csum = NULL; 2694 btrfs_io_bio_init(btrfs_bio);
2724 btrfs_bio->csum_allocated = NULL; 2695 btrfs_bio->iter = bio->bi_iter;
2725 btrfs_bio->end_io = NULL;
2726 }
2727 return new; 2696 return new;
2728} 2697}
2729 2698
2730/* this also allocates from the btrfs_bioset */ 2699struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
2731struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
2732{ 2700{
2733 struct btrfs_io_bio *btrfs_bio;
2734 struct bio *bio; 2701 struct bio *bio;
2735 2702
2736 bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); 2703 /* Bio allocation backed by a bioset does not fail */
2737 if (bio) { 2704 bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset);
2738 btrfs_bio = btrfs_io_bio(bio); 2705 btrfs_io_bio_init(btrfs_io_bio(bio));
2739 btrfs_bio->csum = NULL;
2740 btrfs_bio->csum_allocated = NULL;
2741 btrfs_bio->end_io = NULL;
2742 }
2743 return bio; 2706 return bio;
2744} 2707}
2745 2708
2709struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
2710{
2711 struct bio *bio;
2712 struct btrfs_io_bio *btrfs_bio;
2713
2714 /* this will never fail when it's backed by a bioset */
2715 bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset);
2716 ASSERT(bio);
2717
2718 btrfs_bio = btrfs_io_bio(bio);
2719 btrfs_io_bio_init(btrfs_bio);
2720
2721 bio_trim(bio, offset >> 9, size >> 9);
2722 btrfs_bio->iter = bio->bi_iter;
2723 return bio;
2724}
2746 2725
2747static int __must_check submit_one_bio(struct bio *bio, int mirror_num, 2726static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
2748 unsigned long bio_flags) 2727 unsigned long bio_flags)
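btrfs_io_bio_init() collapses the repeated NULL assignments into one memset over every member that precedes the embedded bio; the bio itself was set up by the bioset allocator and must be left alone, which is also why __GFP_ZERO cannot be used. The offsetof() trick in a self-contained form (layout illustrative only):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct fake_bio { int flags; };

/* Mirrors the btrfs_io_bio layout convention: private members first,
 * embedded bio last, so one offsetof() bounds the zeroed region. */
struct io_bio {
	unsigned int mirror_num;
	char *csum;
	char *csum_allocated;
	void (*end_io)(void);
	struct fake_bio bio;	/* stays initialized by the allocator */
};

static void io_bio_init(struct io_bio *b)
{
	memset(b, 0, offsetof(struct io_bio, bio));
}

int main(void)
{
	struct io_bio b = { .csum = (char *)"stale", .bio = { .flags = 7 } };

	io_bio_init(&b);
	/* csum is cleared, bio.flags survives untouched */
	printf("csum=%p bio.flags=%d\n", (void *)b.csum, b.bio.flags);
	return 0;
}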
@@ -2759,7 +2738,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
2759 bio_get(bio); 2738 bio_get(bio);
2760 2739
2761 if (tree->ops) 2740 if (tree->ops)
2762 ret = tree->ops->submit_bio_hook(page->mapping->host, bio, 2741 ret = tree->ops->submit_bio_hook(tree->private_data, bio,
2763 mirror_num, bio_flags, start); 2742 mirror_num, bio_flags, start);
2764 else 2743 else
2765 btrfsic_submit_bio(bio); 2744 btrfsic_submit_bio(bio);
@@ -2822,11 +2801,7 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree,
2822 } 2801 }
2823 } 2802 }
2824 2803
2825 bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES, 2804 bio = btrfs_bio_alloc(bdev, sector << 9);
2826 GFP_NOFS | __GFP_HIGH);
2827 if (!bio)
2828 return -ENOMEM;
2829
2830 bio_add_page(bio, page, page_size, offset); 2805 bio_add_page(bio, page, page_size, offset);
2831 bio->bi_end_io = end_io_func; 2806 bio->bi_end_io = end_io_func;
2832 bio->bi_private = tree; 2807 bio->bi_private = tree;
@@ -3762,7 +3737,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3762 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 3737 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
3763 */ 3738 */
3764 start = btrfs_item_nr_offset(nritems); 3739 start = btrfs_item_nr_offset(nritems);
3765 end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb); 3740 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb);
3766 memzero_extent_buffer(eb, start, end - start); 3741 memzero_extent_buffer(eb, start, end - start);
3767 } 3742 }
3768 3743
@@ -4468,29 +4443,25 @@ try_submit_last:
4468} 4443}
4469 4444
4470/* 4445/*
4471 * Sanity check for fiemap cache 4446 * Emit last fiemap cache
4472 * 4447 *
4473 * All fiemap cache should be submitted by emit_fiemap_extent() 4448 * The last fiemap cache may still be cached in the following case:
4474 * Iteration should be terminated either by last fiemap extent or 4449 * 0 4k 8k
4475 * fieinfo->fi_extents_max. 4450 * |<- Fiemap range ->|
4476 * So no cached fiemap should exist. 4451 * |<------------ First extent ----------->|
4452 *
4453 * In this case, the first extent range will be cached but not emitted.
4454 * So we must emit it before ending extent_fiemap().
4477 */ 4455 */
4478static int check_fiemap_cache(struct btrfs_fs_info *fs_info, 4456static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info,
4479 struct fiemap_extent_info *fieinfo, 4457 struct fiemap_extent_info *fieinfo,
4480 struct fiemap_cache *cache) 4458 struct fiemap_cache *cache)
4481{ 4459{
4482 int ret; 4460 int ret;
4483 4461
4484 if (!cache->cached) 4462 if (!cache->cached)
4485 return 0; 4463 return 0;
4486 4464
4487 /* Small and recoverable problem, only to inform developers */
4488#ifdef CONFIG_BTRFS_DEBUG
4489 WARN_ON(1);
4490#endif
4491 btrfs_warn(fs_info,
4492 "unhandled fiemap cache detected: offset=%llu phys=%llu len=%llu flags=0x%x",
4493 cache->offset, cache->phys, cache->len, cache->flags);
4494 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, 4465 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4495 cache->len, cache->flags); 4466 cache->len, cache->flags);
4496 cache->cached = false; 4467 cache->cached = false;
@@ -4706,7 +4677,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4706 } 4677 }
4707out_free: 4678out_free:
4708 if (!ret) 4679 if (!ret)
4709 ret = check_fiemap_cache(root->fs_info, fieinfo, &cache); 4680 ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache);
4710 free_extent_map(em); 4681 free_extent_map(em);
4711out: 4682out:
4712 btrfs_free_path(path); 4683 btrfs_free_path(path);
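A cached fiemap extent can legitimately outlive the loop when the requested range ends inside the first extent, so the leftover entry is now emitted as the final result rather than warned about. The pattern, reduced to a compilable sketch (fill_next_extent() stands in for fiemap_fill_next_extent()):

#include <stdbool.h>
#include <stdio.h>

struct fiemap_cache {
	bool cached;
	unsigned long long offset, phys, len;
	unsigned int flags;
};

/* Stand-in for fiemap_fill_next_extent(): report one extent upward. */
static int fill_next_extent(const struct fiemap_cache *c)
{
	printf("extent: off=%llu len=%llu\n", c->offset, c->len);
	return 0;
}

/* Shape of emit_last_fiemap_cache(): whatever is still cached when
 * iteration stops is valid output, not an error, so emit it once. */
static int emit_last_cache(struct fiemap_cache *cache)
{
	int ret;

	if (!cache->cached)
		return 0;
	ret = fill_next_extent(cache);
	cache->cached = false;
	return ret;
}

int main(void)
{
	struct fiemap_cache c = { .cached = true, .offset = 0, .len = 8192 };

	return emit_last_cache(&c);
}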
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 487ca0207cb6..3fb8513bf02e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -92,9 +92,9 @@ struct btrfs_inode;
92struct btrfs_io_bio; 92struct btrfs_io_bio;
93struct io_failure_record; 93struct io_failure_record;
94 94
95typedef blk_status_t (extent_submit_bio_hook_t)(struct inode *inode, 95typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio,
96 struct bio *bio, int mirror_num, unsigned long bio_flags, 96 int mirror_num, unsigned long bio_flags,
97 u64 bio_offset); 97 u64 bio_offset);
98struct extent_io_ops { 98struct extent_io_ops {
99 /* 99 /*
100 * The following callbacks must be always defined, the function 100 * The following callbacks must be always defined, the function
@@ -108,32 +108,36 @@ struct extent_io_ops {
108 size_t size, struct bio *bio, 108 size_t size, struct bio *bio,
109 unsigned long bio_flags); 109 unsigned long bio_flags);
110 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); 110 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
111 struct btrfs_fs_info *(*tree_fs_info)(void *private_data);
112 void (*set_range_writeback)(void *private_data, u64 start, u64 end);
111 113
112 /* 114 /*
113 * Optional hooks, called if the pointer is not NULL 115 * Optional hooks, called if the pointer is not NULL
114 */ 116 */
115 int (*fill_delalloc)(struct inode *inode, struct page *locked_page, 117 int (*fill_delalloc)(void *private_data, struct page *locked_page,
116 u64 start, u64 end, int *page_started, 118 u64 start, u64 end, int *page_started,
117 unsigned long *nr_written); 119 unsigned long *nr_written);
118 120
119 int (*writepage_start_hook)(struct page *page, u64 start, u64 end); 121 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
120 void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 122 void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
121 struct extent_state *state, int uptodate); 123 struct extent_state *state, int uptodate);
122 void (*set_bit_hook)(struct inode *inode, struct extent_state *state, 124 void (*set_bit_hook)(void *private_data, struct extent_state *state,
123 unsigned *bits); 125 unsigned *bits);
124 void (*clear_bit_hook)(struct btrfs_inode *inode, 126 void (*clear_bit_hook)(void *private_data,
125 struct extent_state *state, 127 struct extent_state *state,
126 unsigned *bits); 128 unsigned *bits);
127 void (*merge_extent_hook)(struct inode *inode, 129 void (*merge_extent_hook)(void *private_data,
128 struct extent_state *new, 130 struct extent_state *new,
129 struct extent_state *other); 131 struct extent_state *other);
130 void (*split_extent_hook)(struct inode *inode, 132 void (*split_extent_hook)(void *private_data,
131 struct extent_state *orig, u64 split); 133 struct extent_state *orig, u64 split);
134 void (*check_extent_io_range)(void *private_data, const char *caller,
135 u64 start, u64 end);
132}; 136};
133 137
134struct extent_io_tree { 138struct extent_io_tree {
135 struct rb_root state; 139 struct rb_root state;
136 struct address_space *mapping; 140 void *private_data;
137 u64 dirty_bytes; 141 u64 dirty_bytes;
138 int track_uptodate; 142 int track_uptodate;
139 spinlock_t lock; 143 spinlock_t lock;
@@ -205,12 +209,46 @@ struct extent_buffer {
205 */ 209 */
206struct extent_changeset { 210struct extent_changeset {
207 /* How many bytes are set/cleared in this operation */ 211 /* How many bytes are set/cleared in this operation */
208 u64 bytes_changed; 212 unsigned int bytes_changed;
209 213
210 /* Changed ranges */ 214 /* Changed ranges */
211 struct ulist range_changed; 215 struct ulist range_changed;
212}; 216};
213 217
218static inline void extent_changeset_init(struct extent_changeset *changeset)
219{
220 changeset->bytes_changed = 0;
221 ulist_init(&changeset->range_changed);
222}
223
224static inline struct extent_changeset *extent_changeset_alloc(void)
225{
226 struct extent_changeset *ret;
227
228 ret = kmalloc(sizeof(*ret), GFP_KERNEL);
229 if (!ret)
230 return NULL;
231
232 extent_changeset_init(ret);
233 return ret;
234}
235
236static inline void extent_changeset_release(struct extent_changeset *changeset)
237{
238 if (!changeset)
239 return;
240 changeset->bytes_changed = 0;
241 ulist_release(&changeset->range_changed);
242}
243
244static inline void extent_changeset_free(struct extent_changeset *changeset)
245{
246 if (!changeset)
247 return;
248 extent_changeset_release(changeset);
249 kfree(changeset);
250}
251
214static inline void extent_set_compress_type(unsigned long *bio_flags, 252static inline void extent_set_compress_type(unsigned long *bio_flags,
215 int compress_type) 253 int compress_type)
216{ 254{
@@ -230,8 +268,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
230 u64 start, u64 len, 268 u64 start, u64 len,
231 int create); 269 int create);
232 270
233void extent_io_tree_init(struct extent_io_tree *tree, 271void extent_io_tree_init(struct extent_io_tree *tree, void *private_data);
234 struct address_space *mapping);
235int try_release_extent_mapping(struct extent_map_tree *map, 272int try_release_extent_mapping(struct extent_map_tree *map,
236 struct extent_io_tree *tree, struct page *page, 273 struct extent_io_tree *tree, struct page *page,
237 gfp_t mask); 274 gfp_t mask);
@@ -459,20 +496,21 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
459 u64 delalloc_end, struct page *locked_page, 496 u64 delalloc_end, struct page *locked_page,
460 unsigned bits_to_clear, 497 unsigned bits_to_clear,
461 unsigned long page_ops); 498 unsigned long page_ops);
462struct bio * 499struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte);
463btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 500struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
464 gfp_t gfp_flags); 501struct bio *btrfs_bio_clone(struct bio *bio);
465struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs); 502struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
466struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
467 503
468struct btrfs_fs_info; 504struct btrfs_fs_info;
469struct btrfs_inode; 505struct btrfs_inode;
470 506
471int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, 507int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
472 u64 logical, struct page *page, 508 u64 length, u64 logical, struct page *page,
473 unsigned int pg_offset, int mirror_num); 509 unsigned int pg_offset, int mirror_num);
474int clean_io_failure(struct btrfs_inode *inode, u64 start, 510int clean_io_failure(struct btrfs_fs_info *fs_info,
475 struct page *page, unsigned int pg_offset); 511 struct extent_io_tree *failure_tree,
512 struct extent_io_tree *io_tree, u64 start,
513 struct page *page, u64 ino, unsigned int pg_offset);
476void end_extent_writepage(struct page *page, int err, u64 start, u64 end); 514void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
477int repair_eb_io_failure(struct btrfs_fs_info *fs_info, 515int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
478 struct extent_buffer *eb, int mirror_num); 516 struct extent_buffer *eb, int mirror_num);
@@ -507,7 +545,9 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
507 struct io_failure_record *failrec, 545 struct io_failure_record *failrec,
508 struct page *page, int pg_offset, int icsum, 546 struct page *page, int pg_offset, int icsum,
509 bio_end_io_t *endio_func, void *data); 547 bio_end_io_t *endio_func, void *data);
510int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec); 548int free_io_failure(struct extent_io_tree *failure_tree,
549 struct extent_io_tree *io_tree,
550 struct io_failure_record *rec);
511#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 551#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
512noinline u64 find_lock_delalloc_range(struct inode *inode, 552noinline u64 find_lock_delalloc_range(struct inode *inode,
513 struct extent_io_tree *tree, 553 struct extent_io_tree *tree,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 5b1c7090e546..fdcb41002623 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -164,7 +164,8 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
164 u64 logical_offset, u32 *dst, int dio) 164 u64 logical_offset, u32 *dst, int dio)
165{ 165{
166 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 166 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
167 struct bio_vec *bvec; 167 struct bio_vec bvec;
168 struct bvec_iter iter;
168 struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); 169 struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
169 struct btrfs_csum_item *item = NULL; 170 struct btrfs_csum_item *item = NULL;
170 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 171 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -177,7 +178,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
177 u64 page_bytes_left; 178 u64 page_bytes_left;
178 u32 diff; 179 u32 diff;
179 int nblocks; 180 int nblocks;
180 int count = 0, i; 181 int count = 0;
181 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 182 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
182 183
183 path = btrfs_alloc_path(); 184 path = btrfs_alloc_path();
@@ -206,8 +207,6 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
206 if (bio->bi_iter.bi_size > PAGE_SIZE * 8) 207 if (bio->bi_iter.bi_size > PAGE_SIZE * 8)
207 path->reada = READA_FORWARD; 208 path->reada = READA_FORWARD;
208 209
209 WARN_ON(bio->bi_vcnt <= 0);
210
211 /* 210 /*
212 * the free space stuff is only read when it hasn't been 211 * the free space stuff is only read when it hasn't been
213 * updated in the current transaction. So, we can safely 212 * updated in the current transaction. So, we can safely
@@ -223,13 +222,13 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
223 if (dio) 222 if (dio)
224 offset = logical_offset; 223 offset = logical_offset;
225 224
226 bio_for_each_segment_all(bvec, bio, i) { 225 bio_for_each_segment(bvec, bio, iter) {
227 page_bytes_left = bvec->bv_len; 226 page_bytes_left = bvec.bv_len;
228 if (count) 227 if (count)
229 goto next; 228 goto next;
230 229
231 if (!dio) 230 if (!dio)
232 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 231 offset = page_offset(bvec.bv_page) + bvec.bv_offset;
233 count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, 232 count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
234 (u32 *)csum, nblocks); 233 (u32 *)csum, nblocks);
235 if (count) 234 if (count)
@@ -440,15 +439,15 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
440 struct btrfs_ordered_sum *sums; 439 struct btrfs_ordered_sum *sums;
441 struct btrfs_ordered_extent *ordered = NULL; 440 struct btrfs_ordered_extent *ordered = NULL;
442 char *data; 441 char *data;
443 struct bio_vec *bvec; 442 struct bvec_iter iter;
443 struct bio_vec bvec;
444 int index; 444 int index;
445 int nr_sectors; 445 int nr_sectors;
446 int i, j;
447 unsigned long total_bytes = 0; 446 unsigned long total_bytes = 0;
448 unsigned long this_sum_bytes = 0; 447 unsigned long this_sum_bytes = 0;
448 int i;
449 u64 offset; 449 u64 offset;
450 450
451 WARN_ON(bio->bi_vcnt <= 0);
452 sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), 451 sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
453 GFP_NOFS); 452 GFP_NOFS);
454 if (!sums) 453 if (!sums)
@@ -465,19 +464,19 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
465 sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; 464 sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
466 index = 0; 465 index = 0;
467 466
468 bio_for_each_segment_all(bvec, bio, j) { 467 bio_for_each_segment(bvec, bio, iter) {
469 if (!contig) 468 if (!contig)
470 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 469 offset = page_offset(bvec.bv_page) + bvec.bv_offset;
471 470
472 if (!ordered) { 471 if (!ordered) {
473 ordered = btrfs_lookup_ordered_extent(inode, offset); 472 ordered = btrfs_lookup_ordered_extent(inode, offset);
474 BUG_ON(!ordered); /* Logic error */ 473 BUG_ON(!ordered); /* Logic error */
475 } 474 }
476 475
477 data = kmap_atomic(bvec->bv_page); 476 data = kmap_atomic(bvec.bv_page);
478 477
479 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, 478 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info,
480 bvec->bv_len + fs_info->sectorsize 479 bvec.bv_len + fs_info->sectorsize
481 - 1); 480 - 1);
482 481
483 for (i = 0; i < nr_sectors; i++) { 482 for (i = 0; i < nr_sectors; i++) {
@@ -504,12 +503,12 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
504 + total_bytes; 503 + total_bytes;
505 index = 0; 504 index = 0;
506 505
507 data = kmap_atomic(bvec->bv_page); 506 data = kmap_atomic(bvec.bv_page);
508 } 507 }
509 508
510 sums->sums[index] = ~(u32)0; 509 sums->sums[index] = ~(u32)0;
511 sums->sums[index] 510 sums->sums[index]
512 = btrfs_csum_data(data + bvec->bv_offset 511 = btrfs_csum_data(data + bvec.bv_offset
513 + (i * fs_info->sectorsize), 512 + (i * fs_info->sectorsize),
514 sums->sums[index], 513 sums->sums[index],
515 fs_info->sectorsize); 514 fs_info->sectorsize);
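Both checksum walkers switch from bio_for_each_segment_all(), which dereferences the bio's own bvec array slot by slot, to bio_for_each_segment(), which advances a bvec_iter and hands out each segment by value; that is the safe choice once bios can be cheap clones sharing the parent's bvec array. A rough userspace analogy of the by-value cursor (not the block-layer API):

#include <stdio.h>

struct vec { unsigned int len; };

/* A "clone" shares the segment array but owns its cursor, like a bio
 * cloned with bio_clone_fast() owns bi_iter but not bi_io_vec. */
struct view {
	const struct vec *segs;
	unsigned int idx, count;
};

static int next_segment(struct view *it, struct vec *out)
{
	if (it->idx >= it->count)
		return 0;
	*out = it->segs[it->idx++];	/* copy out, never expose the slot */
	return 1;
}

int main(void)
{
	struct vec segs[] = { { 4096 }, { 4096 }, { 512 } };
	struct view parent = { segs, 0, 3 };
	struct view clone = { segs, 1, 3 };	/* starts mid-array */
	struct vec v;

	while (next_segment(&clone, &v))
		printf("clone sees %u bytes\n", v.len);
	while (next_segment(&parent, &v))
		printf("parent sees %u bytes\n", v.len);
	return 0;
}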
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 59e2dccdf75b..24338702ea5b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1581,6 +1581,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1581 struct btrfs_root *root = BTRFS_I(inode)->root; 1581 struct btrfs_root *root = BTRFS_I(inode)->root;
1582 struct page **pages = NULL; 1582 struct page **pages = NULL;
1583 struct extent_state *cached_state = NULL; 1583 struct extent_state *cached_state = NULL;
1584 struct extent_changeset *data_reserved = NULL;
1584 u64 release_bytes = 0; 1585 u64 release_bytes = 0;
1585 u64 lockstart; 1586 u64 lockstart;
1586 u64 lockend; 1587 u64 lockend;
@@ -1628,7 +1629,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1628 reserve_bytes = round_up(write_bytes + sector_offset, 1629 reserve_bytes = round_up(write_bytes + sector_offset,
1629 fs_info->sectorsize); 1630 fs_info->sectorsize);
1630 1631
1631 ret = btrfs_check_data_free_space(inode, pos, write_bytes); 1632 extent_changeset_release(data_reserved);
1633 ret = btrfs_check_data_free_space(inode, &data_reserved, pos,
1634 write_bytes);
1632 if (ret < 0) { 1635 if (ret < 0) {
1633 if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | 1636 if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1634 BTRFS_INODE_PREALLOC)) && 1637 BTRFS_INODE_PREALLOC)) &&
@@ -1657,8 +1660,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1657 reserve_bytes); 1660 reserve_bytes);
1658 if (ret) { 1661 if (ret) {
1659 if (!only_release_metadata) 1662 if (!only_release_metadata)
1660 btrfs_free_reserved_data_space(inode, pos, 1663 btrfs_free_reserved_data_space(inode,
1661 write_bytes); 1664 data_reserved, pos,
1665 write_bytes);
1662 else 1666 else
1663 btrfs_end_write_no_snapshoting(root); 1667 btrfs_end_write_no_snapshoting(root);
1664 break; 1668 break;
@@ -1740,8 +1744,9 @@ again:
1740 __pos = round_down(pos, 1744 __pos = round_down(pos,
1741 fs_info->sectorsize) + 1745 fs_info->sectorsize) +
1742 (dirty_pages << PAGE_SHIFT); 1746 (dirty_pages << PAGE_SHIFT);
1743 btrfs_delalloc_release_space(inode, __pos, 1747 btrfs_delalloc_release_space(inode,
1744 release_bytes); 1748 data_reserved, __pos,
1749 release_bytes);
1745 } 1750 }
1746 } 1751 }
1747 1752
@@ -1796,12 +1801,13 @@ again:
1796 btrfs_delalloc_release_metadata(BTRFS_I(inode), 1801 btrfs_delalloc_release_metadata(BTRFS_I(inode),
1797 release_bytes); 1802 release_bytes);
1798 } else { 1803 } else {
1799 btrfs_delalloc_release_space(inode, 1804 btrfs_delalloc_release_space(inode, data_reserved,
1800 round_down(pos, fs_info->sectorsize), 1805 round_down(pos, fs_info->sectorsize),
1801 release_bytes); 1806 release_bytes);
1802 } 1807 }
1803 } 1808 }
1804 1809
1810 extent_changeset_free(data_reserved);
1805 return num_written ? num_written : ret; 1811 return num_written ? num_written : ret;
1806} 1812}
1807 1813
@@ -2405,10 +2411,13 @@ out:
2405 */ 2411 */
2406static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) 2412static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
2407{ 2413{
2414 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2408 struct extent_map *em; 2415 struct extent_map *em;
2409 int ret = 0; 2416 int ret = 0;
2410 2417
2411 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, *start, *len, 0); 2418 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
2419 round_down(*start, fs_info->sectorsize),
2420 round_up(*len, fs_info->sectorsize), 0);
2412 if (IS_ERR(em)) 2421 if (IS_ERR(em))
2413 return PTR_ERR(em); 2422 return PTR_ERR(em);
2414 2423
@@ -2784,6 +2793,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2784{ 2793{
2785 struct inode *inode = file_inode(file); 2794 struct inode *inode = file_inode(file);
2786 struct extent_state *cached_state = NULL; 2795 struct extent_state *cached_state = NULL;
2796 struct extent_changeset *data_reserved = NULL;
2787 struct falloc_range *range; 2797 struct falloc_range *range;
2788 struct falloc_range *tmp; 2798 struct falloc_range *tmp;
2789 struct list_head reserve_list; 2799 struct list_head reserve_list;
@@ -2913,8 +2923,8 @@ static long btrfs_fallocate(struct file *file, int mode,
2913 free_extent_map(em); 2923 free_extent_map(em);
2914 break; 2924 break;
2915 } 2925 }
2916 ret = btrfs_qgroup_reserve_data(inode, cur_offset, 2926 ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
2917 last_byte - cur_offset); 2927 cur_offset, last_byte - cur_offset);
2918 if (ret < 0) { 2928 if (ret < 0) {
2919 free_extent_map(em); 2929 free_extent_map(em);
2920 break; 2930 break;
@@ -2925,8 +2935,8 @@ static long btrfs_fallocate(struct file *file, int mode,
2925 * range, free reserved data space first, otherwise 2935 * range, free reserved data space first, otherwise
2926 * it'll result in false ENOSPC error. 2936 * it'll result in false ENOSPC error.
2927 */ 2937 */
2928 btrfs_free_reserved_data_space(inode, cur_offset, 2938 btrfs_free_reserved_data_space(inode, data_reserved,
2929 last_byte - cur_offset); 2939 cur_offset, last_byte - cur_offset);
2930 } 2940 }
2931 free_extent_map(em); 2941 free_extent_map(em);
2932 cur_offset = last_byte; 2942 cur_offset = last_byte;
@@ -2945,8 +2955,9 @@ static long btrfs_fallocate(struct file *file, int mode,
2945 range->len, i_blocksize(inode), 2955 range->len, i_blocksize(inode),
2946 offset + len, &alloc_hint); 2956 offset + len, &alloc_hint);
2947 else 2957 else
2948 btrfs_free_reserved_data_space(inode, range->start, 2958 btrfs_free_reserved_data_space(inode,
2949 range->len); 2959 data_reserved, range->start,
2960 range->len);
2950 list_del(&range->list); 2961 list_del(&range->list);
2951 kfree(range); 2962 kfree(range);
2952 } 2963 }
@@ -2984,8 +2995,9 @@ out:
2984 inode_unlock(inode); 2995 inode_unlock(inode);
2985 /* Let go of our reservation. */ 2996 /* Let go of our reservation. */
2986 if (ret != 0) 2997 if (ret != 0)
2987 btrfs_free_reserved_data_space(inode, alloc_start, 2998 btrfs_free_reserved_data_space(inode, data_reserved,
2988 alloc_end - cur_offset); 2999 alloc_start, alloc_end - cur_offset);
3000 extent_changeset_free(data_reserved);
2989 return ret; 3001 return ret;
2990} 3002}
2991 3003
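__btrfs_buffered_write() demonstrates the intended data_reserved lifecycle: the changeset is released at the top of each loop iteration so ranges from the previous pass never leak into the next reservation, and freed exactly once on exit. Reduced to a compilable sketch (stand-in type; the real extent_changeset also carries a ulist of changed ranges and is allocated inside the reserve path):

#include <stdio.h>
#include <stdlib.h>

struct changeset { unsigned int bytes_changed; };

static void changeset_release(struct changeset *c)
{
	if (c)
		c->bytes_changed = 0;
}

static void changeset_free(struct changeset *c)
{
	changeset_release(c);
	free(c);
}

int main(void)
{
	struct changeset *data_reserved = NULL;

	for (int pass = 0; pass < 3; pass++) {
		/* Top of the write loop: drop ranges from the last pass. */
		changeset_release(data_reserved);
		if (!data_reserved) {
			data_reserved = calloc(1, sizeof(*data_reserved));
			if (!data_reserved)
				return 1;
		}
		data_reserved->bytes_changed += 4096;	/* pretend reservation */
		printf("pass %d reserved %u bytes\n",
		       pass, data_reserved->bytes_changed);
	}
	changeset_free(data_reserved);	/* single free on exit */
	return 0;
}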
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index fc0bd8406758..a5e34de06c2f 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -17,7 +17,7 @@
17 */ 17 */
18 18
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/vmalloc.h> 20#include <linux/sched/mm.h>
21#include "ctree.h" 21#include "ctree.h"
22#include "disk-io.h" 22#include "disk-io.h"
23#include "locking.h" 23#include "locking.h"
@@ -153,21 +153,21 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
153 153
154static u8 *alloc_bitmap(u32 bitmap_size) 154static u8 *alloc_bitmap(u32 bitmap_size)
155{ 155{
156 void *mem; 156 u8 *ret;
157 unsigned int nofs_flag;
157 158
158 /* 159 /*
159 * The allocation size varies, observed numbers were < 4K up to 16K. 160 * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse
160 * Using vmalloc unconditionally would be too heavy, we'll try 161 * into the filesystem as the free space bitmap can be modified in the
161 * contiguous allocations first. 162 * critical section of a transaction commit.
163 *
164 * TODO: push the memalloc_nofs_{save,restore}() to the caller where we
165 * know that recursion is unsafe.
162 */ 166 */
163 if (bitmap_size <= PAGE_SIZE) 167 nofs_flag = memalloc_nofs_save();
164 return kzalloc(bitmap_size, GFP_NOFS); 168 ret = kvzalloc(bitmap_size, GFP_KERNEL);
165 169 memalloc_nofs_restore(nofs_flag);
166 mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN); 170 return ret;
167 if (mem)
168 return mem;
169
170 return __vmalloc(bitmap_size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL);
171} 171}
172 172
173int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, 173int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
@@ -1188,11 +1188,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
1188 btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); 1188 btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
1189 clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); 1189 clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
1190 1190
1191 ret = btrfs_commit_transaction(trans); 1191 return btrfs_commit_transaction(trans);
1192 if (ret)
1193 return ret;
1194
1195 return 0;
1196 1192
1197abort: 1193abort:
1198 clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); 1194 clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
@@ -1277,11 +1273,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
1277 free_extent_buffer(free_space_root->commit_root); 1273 free_extent_buffer(free_space_root->commit_root);
1278 kfree(free_space_root); 1274 kfree(free_space_root);
1279 1275
1280 ret = btrfs_commit_transaction(trans); 1276 return btrfs_commit_transaction(trans);
1281 if (ret)
1282 return ret;
1283
1284 return 0;
1285 1277
1286abort: 1278abort:
1287 btrfs_abort_transaction(trans, ret); 1279 btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 5c6c20ec64d8..d02019747d00 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -400,6 +400,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
400 struct btrfs_path *path; 400 struct btrfs_path *path;
401 struct inode *inode; 401 struct inode *inode;
402 struct btrfs_block_rsv *rsv; 402 struct btrfs_block_rsv *rsv;
403 struct extent_changeset *data_reserved = NULL;
403 u64 num_bytes; 404 u64 num_bytes;
404 u64 alloc_hint = 0; 405 u64 alloc_hint = 0;
405 int ret; 406 int ret;
@@ -492,7 +493,7 @@ again:
492 /* Just to make sure we have enough space */ 493 /* Just to make sure we have enough space */
493 prealloc += 8 * PAGE_SIZE; 494 prealloc += 8 * PAGE_SIZE;
494 495
495 ret = btrfs_delalloc_reserve_space(inode, 0, prealloc); 496 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc);
496 if (ret) 497 if (ret)
497 goto out_put; 498 goto out_put;
498 499
@@ -516,6 +517,7 @@ out:
516 trans->bytes_reserved = num_bytes; 517 trans->bytes_reserved = num_bytes;
517 518
518 btrfs_free_path(path); 519 btrfs_free_path(path);
520 extent_changeset_free(data_reserved);
519 return ret; 521 return ret;
520} 522}
521 523
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 556c93060606..8d050314591c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -86,7 +86,6 @@ static const struct extent_io_ops btrfs_extent_io_ops;
86 86
87static struct kmem_cache *btrfs_inode_cachep; 87static struct kmem_cache *btrfs_inode_cachep;
88struct kmem_cache *btrfs_trans_handle_cachep; 88struct kmem_cache *btrfs_trans_handle_cachep;
89struct kmem_cache *btrfs_transaction_cachep;
90struct kmem_cache *btrfs_path_cachep; 89struct kmem_cache *btrfs_path_cachep;
91struct kmem_cache *btrfs_free_space_cachep; 90struct kmem_cache *btrfs_free_space_cachep;
92 91
@@ -178,7 +177,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
178 char *kaddr; 177 char *kaddr;
179 unsigned long ptr; 178 unsigned long ptr;
180 struct btrfs_file_extent_item *ei; 179 struct btrfs_file_extent_item *ei;
181 int err = 0;
182 int ret; 180 int ret;
183 size_t cur_size = size; 181 size_t cur_size = size;
184 unsigned long offset; 182 unsigned long offset;
@@ -200,10 +198,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
200 path->leave_spinning = 1; 198 path->leave_spinning = 1;
201 ret = btrfs_insert_empty_item(trans, root, path, &key, 199 ret = btrfs_insert_empty_item(trans, root, path, &key,
202 datasize); 200 datasize);
203 if (ret) { 201 if (ret)
204 err = ret;
205 goto fail; 202 goto fail;
206 }
207 } 203 }
208 leaf = path->nodes[0]; 204 leaf = path->nodes[0];
209 ei = btrfs_item_ptr(leaf, path->slots[0], 205 ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -258,9 +254,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
258 BTRFS_I(inode)->disk_i_size = inode->i_size; 254 BTRFS_I(inode)->disk_i_size = inode->i_size;
259 ret = btrfs_update_inode(trans, root, inode); 255 ret = btrfs_update_inode(trans, root, inode);
260 256
261 return ret;
262fail: 257fail:
263 return err; 258 return ret;
264} 259}
265 260
266 261
@@ -350,7 +345,7 @@ out:
350 * And at reserve time, it's always aligned to page size, so 345 * And at reserve time, it's always aligned to page size, so
351 * just free one page here. 346 * just free one page here.
352 */ 347 */
353 btrfs_qgroup_free_data(inode, 0, PAGE_SIZE); 348 btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
354 btrfs_free_path(path); 349 btrfs_free_path(path);
355 btrfs_end_transaction(trans); 350 btrfs_end_transaction(trans);
356 return ret; 351 return ret;
@@ -608,12 +603,11 @@ cont:
608 603
609 /* 604 /*
610 * one last check to make sure the compression is really a 605 * one last check to make sure the compression is really a
611 * win, compare the page count read with the blocks on disk 606 * win, compare the page count read with the blocks on disk,
607 * compression must free at least one sector size
612 */ 608 */
613 total_in = ALIGN(total_in, PAGE_SIZE); 609 total_in = ALIGN(total_in, PAGE_SIZE);
614 if (total_compressed >= total_in) { 610 if (total_compressed + blocksize <= total_in) {
615 will_compress = 0;
616 } else {
617 num_bytes = total_in; 611 num_bytes = total_in;
618 *num_added += 1; 612 *num_added += 1;
619 613
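
The rewritten check tightens the compression heuristic: it is no longer enough for the output to be merely smaller than the input; it must be smaller by at least one sector (total_compressed + blocksize <= total_in), because on-disk extents are rounded up to sector granularity. For example, with a 4K sectorsize and 16K of input, 15K of compressed output would still occupy four 4K sectors and save nothing, so the extent is kept uncompressed; only at 12K or less does compression win.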
@@ -1568,10 +1562,11 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1568/* 1562/*
1569 * extent_io.c call back to do delayed allocation processing 1563 * extent_io.c call back to do delayed allocation processing
1570 */ 1564 */
1571static int run_delalloc_range(struct inode *inode, struct page *locked_page, 1565static int run_delalloc_range(void *private_data, struct page *locked_page,
1572 u64 start, u64 end, int *page_started, 1566 u64 start, u64 end, int *page_started,
1573 unsigned long *nr_written) 1567 unsigned long *nr_written)
1574{ 1568{
1569 struct inode *inode = private_data;
1575 int ret; 1570 int ret;
1576 int force_cow = need_force_cow(inode, start, end); 1571 int force_cow = need_force_cow(inode, start, end);
1577 1572
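
run_delalloc_range() is the first of a series of hook conversions in this file: the extent_io callbacks now take an opaque void *private_data instead of a struct inode *, so extent_io.c stops assuming its tree hangs off a regular inode (groundwork for removing the btree_inode). Every converted hook recovers the inode with a plain cast; a sketch of the shape, with example_hook standing in for any of them:

	static void example_hook(void *private_data, u64 start, u64 end)
	{
		struct inode *inode = private_data;  /* stored at tree init */

		/* ... use BTRFS_I(inode) exactly as before ... */
	}

The pointer is handed over once, via extent_io_tree_init(&ei->io_tree, inode) in btrfs_alloc_inode() further down.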
@@ -1595,9 +1590,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1595 return ret; 1590 return ret;
1596} 1591}
1597 1592
1598static void btrfs_split_extent_hook(struct inode *inode, 1593static void btrfs_split_extent_hook(void *private_data,
1599 struct extent_state *orig, u64 split) 1594 struct extent_state *orig, u64 split)
1600{ 1595{
1596 struct inode *inode = private_data;
1601 u64 size; 1597 u64 size;
1602 1598
1603 /* not delalloc, ignore it */ 1599 /* not delalloc, ignore it */
@@ -1632,10 +1628,11 @@ static void btrfs_split_extent_hook(struct inode *inode,
1632 * extents, such as when we are doing sequential writes, so we can properly 1628 * extents, such as when we are doing sequential writes, so we can properly
1633 * account for the metadata space we'll need. 1629 * account for the metadata space we'll need.
1634 */ 1630 */
1635static void btrfs_merge_extent_hook(struct inode *inode, 1631static void btrfs_merge_extent_hook(void *private_data,
1636 struct extent_state *new, 1632 struct extent_state *new,
1637 struct extent_state *other) 1633 struct extent_state *other)
1638{ 1634{
1635 struct inode *inode = private_data;
1639 u64 new_size, old_size; 1636 u64 new_size, old_size;
1640 u32 num_extents; 1637 u32 num_extents;
1641 1638
@@ -1735,9 +1732,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1735 * bytes in this file, and to maintain the list of inodes that 1732 * bytes in this file, and to maintain the list of inodes that
1736 * have pending delalloc work to be done. 1733 * have pending delalloc work to be done.
1737 */ 1734 */
1738static void btrfs_set_bit_hook(struct inode *inode, 1735static void btrfs_set_bit_hook(void *private_data,
1739 struct extent_state *state, unsigned *bits) 1736 struct extent_state *state, unsigned *bits)
1740{ 1737{
1738 struct inode *inode = private_data;
1741 1739
1742 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1740 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1743 1741
@@ -1789,10 +1787,11 @@ static void btrfs_set_bit_hook(struct inode *inode,
1789/* 1787/*
1790 * extent_io.c clear_bit_hook, see set_bit_hook for why 1788 * extent_io.c clear_bit_hook, see set_bit_hook for why
1791 */ 1789 */
1792static void btrfs_clear_bit_hook(struct btrfs_inode *inode, 1790static void btrfs_clear_bit_hook(void *private_data,
1793 struct extent_state *state, 1791 struct extent_state *state,
1794 unsigned *bits) 1792 unsigned *bits)
1795{ 1793{
1794 struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
1796 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 1795 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1797 u64 len = state->end + 1 - state->start; 1796 u64 len = state->end + 1 - state->start;
1798 u32 num_extents = count_max_extents(len); 1797 u32 num_extents = count_max_extents(len);
@@ -1900,10 +1899,11 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1900 * At IO completion time the csums attached on the ordered extent record 1899
1901 * are inserted into the btree 1900 * are inserted into the btree
1902 */ 1901 */
1903static blk_status_t __btrfs_submit_bio_start(struct inode *inode, 1902static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio,
1904 struct bio *bio, int mirror_num, unsigned long bio_flags, 1903 int mirror_num, unsigned long bio_flags,
1905 u64 bio_offset) 1904 u64 bio_offset)
1906{ 1905{
1906 struct inode *inode = private_data;
1907 blk_status_t ret = 0; 1907 blk_status_t ret = 0;
1908 1908
1909 ret = btrfs_csum_one_bio(inode, bio, 0, 0); 1909 ret = btrfs_csum_one_bio(inode, bio, 0, 0);
@@ -1919,10 +1919,11 @@ static blk_status_t __btrfs_submit_bio_start(struct inode *inode,
1919 * At IO completion time the csums attached on the ordered extent record 1919
1920 * are inserted into the btree 1920 * are inserted into the btree
1921 */ 1921 */
1922static blk_status_t __btrfs_submit_bio_done(struct inode *inode, 1922static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
1923 struct bio *bio, int mirror_num, unsigned long bio_flags, 1923 int mirror_num, unsigned long bio_flags,
1924 u64 bio_offset) 1924 u64 bio_offset)
1925{ 1925{
1926 struct inode *inode = private_data;
1926 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1927 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1927 blk_status_t ret; 1928 blk_status_t ret;
1928 1929
@@ -1938,10 +1939,11 @@ static blk_status_t __btrfs_submit_bio_done(struct inode *inode,
1938 * extent_io.c submission hook. This does the right thing for csum calculation 1939 * extent_io.c submission hook. This does the right thing for csum calculation
1939 * on write, or reading the csums from the tree before a read 1940 * on write, or reading the csums from the tree before a read
1940 */ 1941 */
1941static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, 1942static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
1942 int mirror_num, unsigned long bio_flags, 1943 int mirror_num, unsigned long bio_flags,
1943 u64 bio_offset) 1944 u64 bio_offset)
1944{ 1945{
1946 struct inode *inode = private_data;
1945 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 1947 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1946 struct btrfs_root *root = BTRFS_I(inode)->root; 1948 struct btrfs_root *root = BTRFS_I(inode)->root;
1947 enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; 1949 enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
@@ -1975,8 +1977,8 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
1975 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 1977 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1976 goto mapit; 1978 goto mapit;
1977 /* we're doing a write, do the async checksumming */ 1979 /* we're doing a write, do the async checksumming */
1978 ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, 1980 ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
1979 bio_flags, bio_offset, 1981 bio_offset, inode,
1980 __btrfs_submit_bio_start, 1982 __btrfs_submit_bio_start,
1981 __btrfs_submit_bio_done); 1983 __btrfs_submit_bio_done);
1982 goto out; 1984 goto out;
@@ -2034,6 +2036,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2034 struct btrfs_writepage_fixup *fixup; 2036 struct btrfs_writepage_fixup *fixup;
2035 struct btrfs_ordered_extent *ordered; 2037 struct btrfs_ordered_extent *ordered;
2036 struct extent_state *cached_state = NULL; 2038 struct extent_state *cached_state = NULL;
2039 struct extent_changeset *data_reserved = NULL;
2037 struct page *page; 2040 struct page *page;
2038 struct inode *inode; 2041 struct inode *inode;
2039 u64 page_start; 2042 u64 page_start;
@@ -2071,7 +2074,7 @@ again:
2071 goto again; 2074 goto again;
2072 } 2075 }
2073 2076
2074 ret = btrfs_delalloc_reserve_space(inode, page_start, 2077 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2075 PAGE_SIZE); 2078 PAGE_SIZE);
2076 if (ret) { 2079 if (ret) {
2077 mapping_set_error(page->mapping, ret); 2080 mapping_set_error(page->mapping, ret);
@@ -2091,6 +2094,7 @@ out_page:
2091 unlock_page(page); 2094 unlock_page(page);
2092 put_page(page); 2095 put_page(page);
2093 kfree(fixup); 2096 kfree(fixup);
2097 extent_changeset_free(data_reserved);
2094} 2098}
2095 2099
2096/* 2100/*
@@ -2142,6 +2146,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2142 struct btrfs_path *path; 2146 struct btrfs_path *path;
2143 struct extent_buffer *leaf; 2147 struct extent_buffer *leaf;
2144 struct btrfs_key ins; 2148 struct btrfs_key ins;
2149 u64 qg_released;
2145 int extent_inserted = 0; 2150 int extent_inserted = 0;
2146 int ret; 2151 int ret;
2147 2152
@@ -2197,13 +2202,17 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2197 ins.objectid = disk_bytenr; 2202 ins.objectid = disk_bytenr;
2198 ins.offset = disk_num_bytes; 2203 ins.offset = disk_num_bytes;
2199 ins.type = BTRFS_EXTENT_ITEM_KEY; 2204 ins.type = BTRFS_EXTENT_ITEM_KEY;
2200 ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, 2205
2201 btrfs_ino(BTRFS_I(inode)), file_pos, ram_bytes, &ins);
2202 /* 2206 /*
2203 * Release the reserved range from inode dirty range map, as it is 2207 * Release the reserved range from inode dirty range map, as it is
2204 * already moved into delayed_ref_head 2208 * already moved into delayed_ref_head
2205 */ 2209 */
2206 btrfs_qgroup_release_data(inode, file_pos, ram_bytes); 2210 ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2211 if (ret < 0)
2212 goto out;
2213 qg_released = ret;
2214 ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid,
2215 btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins);
2207out: 2216out:
2208 btrfs_free_path(path); 2217 btrfs_free_path(path);
2209 2218
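
The reordering in insert_reserved_file_extent() is the point of this hunk: qgroup reserved data is released first, and the byte count the qgroup code reports as actually released (the non-negative return of btrfs_qgroup_release_data()) is what gets fed to btrfs_alloc_reserved_file_extent(), instead of the nominal ram_bytes. If part of the range had already been released, charging the delayed ref with ram_bytes would over-account and later underflow the reserved counter, which is the bug family these qgroup patches target.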
@@ -2925,7 +2934,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2925 * space for NOCOW range. 2934 * space for NOCOW range.
2926 * As NOCOW won't cause a new delayed ref, just free the space 2935 * As NOCOW won't cause a new delayed ref, just free the space
2927 */ 2936 */
2928 btrfs_qgroup_free_data(inode, ordered_extent->file_offset, 2937 btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
2929 ordered_extent->len); 2938 ordered_extent->len);
2930 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 2939 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2931 if (nolock) 2940 if (nolock)
@@ -4761,6 +4770,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4761 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4770 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4762 struct btrfs_ordered_extent *ordered; 4771 struct btrfs_ordered_extent *ordered;
4763 struct extent_state *cached_state = NULL; 4772 struct extent_state *cached_state = NULL;
4773 struct extent_changeset *data_reserved = NULL;
4764 char *kaddr; 4774 char *kaddr;
4765 u32 blocksize = fs_info->sectorsize; 4775 u32 blocksize = fs_info->sectorsize;
4766 pgoff_t index = from >> PAGE_SHIFT; 4776 pgoff_t index = from >> PAGE_SHIFT;
@@ -4775,7 +4785,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4775 (!len || ((len & (blocksize - 1)) == 0))) 4785 (!len || ((len & (blocksize - 1)) == 0)))
4776 goto out; 4786 goto out;
4777 4787
4778 ret = btrfs_delalloc_reserve_space(inode, 4788 ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
4779 round_down(from, blocksize), blocksize); 4789 round_down(from, blocksize), blocksize);
4780 if (ret) 4790 if (ret)
4781 goto out; 4791 goto out;
@@ -4783,7 +4793,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4783again: 4793again:
4784 page = find_or_create_page(mapping, index, mask); 4794 page = find_or_create_page(mapping, index, mask);
4785 if (!page) { 4795 if (!page) {
4786 btrfs_delalloc_release_space(inode, 4796 btrfs_delalloc_release_space(inode, data_reserved,
4787 round_down(from, blocksize), 4797 round_down(from, blocksize),
4788 blocksize); 4798 blocksize);
4789 ret = -ENOMEM; 4799 ret = -ENOMEM;
@@ -4855,11 +4865,12 @@ again:
4855 4865
4856out_unlock: 4866out_unlock:
4857 if (ret) 4867 if (ret)
4858 btrfs_delalloc_release_space(inode, block_start, 4868 btrfs_delalloc_release_space(inode, data_reserved, block_start,
4859 blocksize); 4869 blocksize);
4860 unlock_page(page); 4870 unlock_page(page);
4861 put_page(page); 4871 put_page(page);
4862out: 4872out:
4873 extent_changeset_free(data_reserved);
4863 return ret; 4874 return ret;
4864} 4875}
4865 4876
@@ -5254,7 +5265,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
5254 * Note, end is the bytenr of last byte, so we need + 1 here. 5265 * Note, end is the bytenr of last byte, so we need + 1 here.
5255 */ 5266 */
5256 if (state->state & EXTENT_DELALLOC) 5267 if (state->state & EXTENT_DELALLOC)
5257 btrfs_qgroup_free_data(inode, start, end - start + 1); 5268 btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
5258 5269
5259 clear_extent_bit(io_tree, start, end, 5270 clear_extent_bit(io_tree, start, end,
5260 EXTENT_LOCKED | EXTENT_DIRTY | 5271 EXTENT_LOCKED | EXTENT_DIRTY |
@@ -5867,7 +5878,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5867 struct inode *inode = file_inode(file); 5878 struct inode *inode = file_inode(file);
5868 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5879 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5869 struct btrfs_root *root = BTRFS_I(inode)->root; 5880 struct btrfs_root *root = BTRFS_I(inode)->root;
5870 struct btrfs_item *item;
5871 struct btrfs_dir_item *di; 5881 struct btrfs_dir_item *di;
5872 struct btrfs_key key; 5882 struct btrfs_key key;
5873 struct btrfs_key found_key; 5883 struct btrfs_key found_key;
@@ -5918,7 +5928,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5918 continue; 5928 continue;
5919 } 5929 }
5920 5930
5921 item = btrfs_item_nr(slot);
5922 btrfs_item_key_to_cpu(leaf, &found_key, slot); 5931 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5923 5932
5924 if (found_key.objectid != key.objectid) 5933 if (found_key.objectid != key.objectid)
@@ -5933,7 +5942,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5933 ctx->pos = found_key.offset; 5942 ctx->pos = found_key.offset;
5934 5943
5935 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 5944 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5936 if (verify_dir_item(fs_info, leaf, di)) 5945 if (verify_dir_item(fs_info, leaf, slot, di))
5937 goto next; 5946 goto next;
5938 5947
5939 name_len = btrfs_dir_name_len(leaf, di); 5948 name_len = btrfs_dir_name_len(leaf, di);
@@ -7479,7 +7488,7 @@ out:
7479bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) 7488bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
7480{ 7489{
7481 struct radix_tree_root *root = &inode->i_mapping->page_tree; 7490 struct radix_tree_root *root = &inode->i_mapping->page_tree;
7482 int found = false; 7491 bool found = false;
7483 void **pagep = NULL; 7492 void **pagep = NULL;
7484 struct page *page = NULL; 7493 struct page *page = NULL;
7485 unsigned long start_idx; 7494 unsigned long start_idx;
@@ -7977,9 +7986,12 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7977 bio_end_io_t *repair_endio, void *repair_arg) 7986 bio_end_io_t *repair_endio, void *repair_arg)
7978{ 7987{
7979 struct io_failure_record *failrec; 7988 struct io_failure_record *failrec;
7989 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7990 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
7980 struct bio *bio; 7991 struct bio *bio;
7981 int isector; 7992 int isector;
7982 int read_mode = 0; 7993 int read_mode = 0;
7994 int segs;
7983 int ret; 7995 int ret;
7984 7996
7985 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 7997 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -7991,13 +8003,13 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7991 ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, 8003 ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
7992 failed_mirror); 8004 failed_mirror);
7993 if (!ret) { 8005 if (!ret) {
7994 free_io_failure(BTRFS_I(inode), failrec); 8006 free_io_failure(failure_tree, io_tree, failrec);
7995 return -EIO; 8007 return -EIO;
7996 } 8008 }
7997 8009
7998 if ((failed_bio->bi_vcnt > 1) 8010 segs = bio_segments(failed_bio);
7999 || (failed_bio->bi_io_vec->bv_len 8011 if (segs > 1 ||
8000 > btrfs_inode_sectorsize(inode))) 8012 (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode)))
8001 read_mode |= REQ_FAILFAST_DEV; 8013 read_mode |= REQ_FAILFAST_DEV;
8002 8014
8003 isector = start - btrfs_io_bio(failed_bio)->logical; 8015 isector = start - btrfs_io_bio(failed_bio)->logical;
@@ -8005,7 +8017,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
8005 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 8017 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
8006 pgoff, isector, repair_endio, repair_arg); 8018 pgoff, isector, repair_endio, repair_arg);
8007 if (!bio) { 8019 if (!bio) {
8008 free_io_failure(BTRFS_I(inode), failrec); 8020 free_io_failure(failure_tree, io_tree, failrec);
8009 return -EIO; 8021 return -EIO;
8010 } 8022 }
8011 bio_set_op_attrs(bio, REQ_OP_READ, read_mode); 8023 bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
@@ -8016,7 +8028,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
8016 8028
8017 ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror); 8029 ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
8018 if (ret) { 8030 if (ret) {
8019 free_io_failure(BTRFS_I(inode), failrec); 8031 free_io_failure(failure_tree, io_tree, failrec);
8020 bio_put(bio); 8032 bio_put(bio);
8021 } 8033 }
8022 8034
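
Two independent cleanups meet in dio_read_error(): free_io_failure() now takes the failure and io trees explicitly rather than a btrfs_inode (the same decoupling from struct inode as elsewhere in the series), and the multi-segment test uses bio_segments(), which counts through the bio iterator and therefore gives the right answer for cloned bios, where bi_vcnt describes the shared bvec array rather than this bio's view of it.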
@@ -8033,19 +8045,24 @@ struct btrfs_retry_complete {
8033static void btrfs_retry_endio_nocsum(struct bio *bio) 8045static void btrfs_retry_endio_nocsum(struct bio *bio)
8034{ 8046{
8035 struct btrfs_retry_complete *done = bio->bi_private; 8047 struct btrfs_retry_complete *done = bio->bi_private;
8048 struct inode *inode = done->inode;
8036 struct bio_vec *bvec; 8049 struct bio_vec *bvec;
8050 struct extent_io_tree *io_tree, *failure_tree;
8037 int i; 8051 int i;
8038 8052
8039 if (bio->bi_status) 8053 if (bio->bi_status)
8040 goto end; 8054 goto end;
8041 8055
8042 ASSERT(bio->bi_vcnt == 1); 8056 ASSERT(bio->bi_vcnt == 1);
8043 ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); 8057 io_tree = &BTRFS_I(inode)->io_tree;
8058 failure_tree = &BTRFS_I(inode)->io_failure_tree;
8059 ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode));
8044 8060
8045 done->uptodate = 1; 8061 done->uptodate = 1;
8046 bio_for_each_segment_all(bvec, bio, i) 8062 bio_for_each_segment_all(bvec, bio, i)
8047 clean_io_failure(BTRFS_I(done->inode), done->start, 8063 clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
8048 bvec->bv_page, 0); 8064 io_tree, done->start, bvec->bv_page,
8065 btrfs_ino(BTRFS_I(inode)), 0);
8049end: 8066end:
8050 complete(&done->done); 8067 complete(&done->done);
8051 bio_put(bio); 8068 bio_put(bio);
@@ -8055,36 +8072,40 @@ static int __btrfs_correct_data_nocsum(struct inode *inode,
8055 struct btrfs_io_bio *io_bio) 8072 struct btrfs_io_bio *io_bio)
8056{ 8073{
8057 struct btrfs_fs_info *fs_info; 8074 struct btrfs_fs_info *fs_info;
8058 struct bio_vec *bvec; 8075 struct bio_vec bvec;
8076 struct bvec_iter iter;
8059 struct btrfs_retry_complete done; 8077 struct btrfs_retry_complete done;
8060 u64 start; 8078 u64 start;
8061 unsigned int pgoff; 8079 unsigned int pgoff;
8062 u32 sectorsize; 8080 u32 sectorsize;
8063 int nr_sectors; 8081 int nr_sectors;
8064 int i;
8065 int ret; 8082 int ret;
8083 int err = 0;
8066 8084
8067 fs_info = BTRFS_I(inode)->root->fs_info; 8085 fs_info = BTRFS_I(inode)->root->fs_info;
8068 sectorsize = fs_info->sectorsize; 8086 sectorsize = fs_info->sectorsize;
8069 8087
8070 start = io_bio->logical; 8088 start = io_bio->logical;
8071 done.inode = inode; 8089 done.inode = inode;
8090 io_bio->bio.bi_iter = io_bio->iter;
8072 8091
8073 bio_for_each_segment_all(bvec, &io_bio->bio, i) { 8092 bio_for_each_segment(bvec, &io_bio->bio, iter) {
8074 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); 8093 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
8075 pgoff = bvec->bv_offset; 8094 pgoff = bvec.bv_offset;
8076 8095
8077next_block_or_try_again: 8096next_block_or_try_again:
8078 done.uptodate = 0; 8097 done.uptodate = 0;
8079 done.start = start; 8098 done.start = start;
8080 init_completion(&done.done); 8099 init_completion(&done.done);
8081 8100
8082 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, 8101 ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
8083 pgoff, start, start + sectorsize - 1, 8102 pgoff, start, start + sectorsize - 1,
8084 io_bio->mirror_num, 8103 io_bio->mirror_num,
8085 btrfs_retry_endio_nocsum, &done); 8104 btrfs_retry_endio_nocsum, &done);
8086 if (ret) 8105 if (ret) {
8087 return ret; 8106 err = ret;
8107 goto next;
8108 }
8088 8109
8089 wait_for_completion(&done.done); 8110 wait_for_completion(&done.done);
8090 8111
@@ -8093,6 +8114,7 @@ next_block_or_try_again:
8093 goto next_block_or_try_again; 8114 goto next_block_or_try_again;
8094 } 8115 }
8095 8116
8117next:
8096 start += sectorsize; 8118 start += sectorsize;
8097 8119
8098 nr_sectors--; 8120 nr_sectors--;
@@ -8103,13 +8125,15 @@ next_block_or_try_again:
8103 } 8125 }
8104 } 8126 }
8105 8127
8106 return 0; 8128 return err;
8107} 8129}
8108 8130
8109static void btrfs_retry_endio(struct bio *bio) 8131static void btrfs_retry_endio(struct bio *bio)
8110{ 8132{
8111 struct btrfs_retry_complete *done = bio->bi_private; 8133 struct btrfs_retry_complete *done = bio->bi_private;
8112 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 8134 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8135 struct extent_io_tree *io_tree, *failure_tree;
8136 struct inode *inode = done->inode;
8113 struct bio_vec *bvec; 8137 struct bio_vec *bvec;
8114 int uptodate; 8138 int uptodate;
8115 int ret; 8139 int ret;
@@ -8123,13 +8147,19 @@ static void btrfs_retry_endio(struct bio *bio)
8123 ASSERT(bio->bi_vcnt == 1); 8147 ASSERT(bio->bi_vcnt == 1);
8124 ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); 8148 ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
8125 8149
8150 io_tree = &BTRFS_I(inode)->io_tree;
8151 failure_tree = &BTRFS_I(inode)->io_failure_tree;
8152
8126 bio_for_each_segment_all(bvec, bio, i) { 8153 bio_for_each_segment_all(bvec, bio, i) {
8127 ret = __readpage_endio_check(done->inode, io_bio, i, 8154 ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
8128 bvec->bv_page, bvec->bv_offset, 8155 bvec->bv_offset, done->start,
8129 done->start, bvec->bv_len); 8156 bvec->bv_len);
8130 if (!ret) 8157 if (!ret)
8131 clean_io_failure(BTRFS_I(done->inode), done->start, 8158 clean_io_failure(BTRFS_I(inode)->root->fs_info,
8132 bvec->bv_page, bvec->bv_offset); 8159 failure_tree, io_tree, done->start,
8160 bvec->bv_page,
8161 btrfs_ino(BTRFS_I(inode)),
8162 bvec->bv_offset);
8133 else 8163 else
8134 uptodate = 0; 8164 uptodate = 0;
8135 } 8165 }
@@ -8144,7 +8174,8 @@ static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
8144 struct btrfs_io_bio *io_bio, blk_status_t err) 8174 struct btrfs_io_bio *io_bio, blk_status_t err)
8145{ 8175{
8146 struct btrfs_fs_info *fs_info; 8176 struct btrfs_fs_info *fs_info;
8147 struct bio_vec *bvec; 8177 struct bio_vec bvec;
8178 struct bvec_iter iter;
8148 struct btrfs_retry_complete done; 8179 struct btrfs_retry_complete done;
8149 u64 start; 8180 u64 start;
8150 u64 offset = 0; 8181 u64 offset = 0;
@@ -8152,7 +8183,7 @@ static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
8152 int nr_sectors; 8183 int nr_sectors;
8153 unsigned int pgoff; 8184 unsigned int pgoff;
8154 int csum_pos; 8185 int csum_pos;
8155 int i; 8186 bool uptodate = (err == 0);
8156 int ret; 8187 int ret;
8157 8188
8158 fs_info = BTRFS_I(inode)->root->fs_info; 8189 fs_info = BTRFS_I(inode)->root->fs_info;
@@ -8161,24 +8192,26 @@ static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
8161 err = 0; 8192 err = 0;
8162 start = io_bio->logical; 8193 start = io_bio->logical;
8163 done.inode = inode; 8194 done.inode = inode;
8195 io_bio->bio.bi_iter = io_bio->iter;
8164 8196
8165 bio_for_each_segment_all(bvec, &io_bio->bio, i) { 8197 bio_for_each_segment(bvec, &io_bio->bio, iter) {
8166 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); 8198 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
8167 8199
8168 pgoff = bvec->bv_offset; 8200 pgoff = bvec.bv_offset;
8169next_block: 8201next_block:
8170 csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); 8202 if (uptodate) {
8171 ret = __readpage_endio_check(inode, io_bio, csum_pos, 8203 csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
8172 bvec->bv_page, pgoff, start, 8204 ret = __readpage_endio_check(inode, io_bio, csum_pos,
8173 sectorsize); 8205 bvec.bv_page, pgoff, start, sectorsize);
8174 if (likely(!ret)) 8206 if (likely(!ret))
8175 goto next; 8207 goto next;
8208 }
8176try_again: 8209try_again:
8177 done.uptodate = 0; 8210 done.uptodate = 0;
8178 done.start = start; 8211 done.start = start;
8179 init_completion(&done.done); 8212 init_completion(&done.done);
8180 8213
8181 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, 8214 ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
8182 pgoff, start, start + sectorsize - 1, 8215 pgoff, start, start + sectorsize - 1,
8183 io_bio->mirror_num, 8216 io_bio->mirror_num,
8184 btrfs_retry_endio, &done); 8217 btrfs_retry_endio, &done);
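
Both DIO retry loops, __btrfs_correct_data_nocsum() above and __btrfs_subio_endio_read() here, switch from bio_for_each_segment_all(), which walks the raw bvec array and is not safe on cloned bios, to the iterator form. Since bi_iter has been consumed by the time the completion handler runs, the loop first restores it from a copy saved at submit time. A sketch, assuming io_bio->iter holds that saved iterator:

	struct bio_vec bvec;   /* a by-value copy per segment, not a pointer */
	struct bvec_iter iter;

	io_bio->bio.bi_iter = io_bio->iter;  /* rewind the spent iterator */
	bio_for_each_segment(bvec, &io_bio->bio, iter) {
		/* use bvec.bv_page, bvec.bv_offset, bvec.bv_len */
	}

The csum variant also gains an uptodate short-circuit: if the bio already failed, there is no point verifying checksums, so every sector goes straight to repair. The nocsum variant now aggregates per-sector errors in err instead of returning on the first failure, so the remaining sectors still get a repair attempt.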
@@ -8233,8 +8266,11 @@ static void btrfs_endio_direct_read(struct bio *bio)
8233 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 8266 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8234 blk_status_t err = bio->bi_status; 8267 blk_status_t err = bio->bi_status;
8235 8268
8236 if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) 8269 if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) {
8237 err = btrfs_subio_endio_read(inode, io_bio, err); 8270 err = btrfs_subio_endio_read(inode, io_bio, err);
8271 if (!err)
8272 bio->bi_status = 0;
8273 }
8238 8274
8239 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, 8275 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
8240 dip->logical_offset + dip->bytes - 1); 8276 dip->logical_offset + dip->bytes - 1);
@@ -8307,10 +8343,11 @@ static void btrfs_endio_direct_write(struct bio *bio)
8307 bio_put(bio); 8343 bio_put(bio);
8308} 8344}
8309 8345
8310static blk_status_t __btrfs_submit_bio_start_direct_io(struct inode *inode, 8346static blk_status_t __btrfs_submit_bio_start_direct_io(void *private_data,
8311 struct bio *bio, int mirror_num, 8347 struct bio *bio, int mirror_num,
8312 unsigned long bio_flags, u64 offset) 8348 unsigned long bio_flags, u64 offset)
8313{ 8349{
8350 struct inode *inode = private_data;
8314 blk_status_t ret; 8351 blk_status_t ret;
8315 ret = btrfs_csum_one_bio(inode, bio, offset, 1); 8352 ret = btrfs_csum_one_bio(inode, bio, offset, 1);
8316 BUG_ON(ret); /* -ENOMEM */ 8353 BUG_ON(ret); /* -ENOMEM */
@@ -8357,16 +8394,6 @@ out:
8357 bio_put(bio); 8394 bio_put(bio);
8358} 8395}
8359 8396
8360static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
8361 u64 first_sector, gfp_t gfp_flags)
8362{
8363 struct bio *bio;
8364 bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
8365 if (bio)
8366 bio_associate_current(bio);
8367 return bio;
8368}
8369
8370static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, 8397static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
8371 struct btrfs_dio_private *dip, 8398 struct btrfs_dio_private *dip,
8372 struct bio *bio, 8399 struct bio *bio,
@@ -8422,8 +8449,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
8422 goto map; 8449 goto map;
8423 8450
8424 if (write && async_submit) { 8451 if (write && async_submit) {
8425 ret = btrfs_wq_submit_bio(fs_info, inode, bio, 0, 0, 8452 ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0,
8426 file_offset, 8453 file_offset, inode,
8427 __btrfs_submit_bio_start_direct_io, 8454 __btrfs_submit_bio_start_direct_io,
8428 __btrfs_submit_bio_done); 8455 __btrfs_submit_bio_done);
8429 goto err; 8456 goto err;
@@ -8453,103 +8480,83 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
8453{ 8480{
8454 struct inode *inode = dip->inode; 8481 struct inode *inode = dip->inode;
8455 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8482 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8456 struct btrfs_root *root = BTRFS_I(inode)->root;
8457 struct bio *bio; 8483 struct bio *bio;
8458 struct bio *orig_bio = dip->orig_bio; 8484 struct bio *orig_bio = dip->orig_bio;
8459 struct bio_vec *bvec;
8460 u64 start_sector = orig_bio->bi_iter.bi_sector; 8485 u64 start_sector = orig_bio->bi_iter.bi_sector;
8461 u64 file_offset = dip->logical_offset; 8486 u64 file_offset = dip->logical_offset;
8462 u64 submit_len = 0;
8463 u64 map_length; 8487 u64 map_length;
8464 u32 blocksize = fs_info->sectorsize;
8465 int async_submit = 0; 8488 int async_submit = 0;
8466 int nr_sectors; 8489 u64 submit_len;
8490 int clone_offset = 0;
8491 int clone_len;
8467 int ret; 8492 int ret;
8468 int i, j;
8469 8493
8470 map_length = orig_bio->bi_iter.bi_size; 8494 map_length = orig_bio->bi_iter.bi_size;
8495 submit_len = map_length;
8471 ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, 8496 ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9,
8472 &map_length, NULL, 0); 8497 &map_length, NULL, 0);
8473 if (ret) 8498 if (ret)
8474 return -EIO; 8499 return -EIO;
8475 8500
8476 if (map_length >= orig_bio->bi_iter.bi_size) { 8501 if (map_length >= submit_len) {
8477 bio = orig_bio; 8502 bio = orig_bio;
8478 dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; 8503 dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
8479 goto submit; 8504 goto submit;
8480 } 8505 }
8481 8506
8482 /* async crcs make it difficult to collect full stripe writes. */ 8507 /* async crcs make it difficult to collect full stripe writes. */
8483 if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK) 8508 if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK)
8484 async_submit = 0; 8509 async_submit = 0;
8485 else 8510 else
8486 async_submit = 1; 8511 async_submit = 1;
8487 8512
8488 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 8513 /* bio split */
8489 if (!bio) 8514 ASSERT(map_length <= INT_MAX);
8490 return -ENOMEM;
8491
8492 bio->bi_opf = orig_bio->bi_opf;
8493 bio->bi_private = dip;
8494 bio->bi_end_io = btrfs_end_dio_bio;
8495 btrfs_io_bio(bio)->logical = file_offset;
8496 atomic_inc(&dip->pending_bios); 8515 atomic_inc(&dip->pending_bios);
8516 do {
8517 clone_len = min_t(int, submit_len, map_length);
8497 8518
8498 bio_for_each_segment_all(bvec, orig_bio, j) { 8519 /*
8499 nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); 8520 * This will never fail as it's passing GFP_NOFS and
8500 i = 0; 8521 * the allocation is backed by btrfs_bioset.
8501next_block: 8522 */
8502 if (unlikely(map_length < submit_len + blocksize || 8523 bio = btrfs_bio_clone_partial(orig_bio, clone_offset,
8503 bio_add_page(bio, bvec->bv_page, blocksize, 8524 clone_len);
8504 bvec->bv_offset + (i * blocksize)) < blocksize)) { 8525 bio->bi_private = dip;
8505 /* 8526 bio->bi_end_io = btrfs_end_dio_bio;
8506 * inc the count before we submit the bio so 8527 btrfs_io_bio(bio)->logical = file_offset;
8507 * we know the end IO handler won't happen before 8528
8508 * we inc the count. Otherwise, the dip might get freed 8529 ASSERT(submit_len >= clone_len);
8509 * before we're done setting it up 8530 submit_len -= clone_len;
8510 */ 8531 if (submit_len == 0)
8511 atomic_inc(&dip->pending_bios); 8532 break;
8512 ret = __btrfs_submit_dio_bio(bio, inode,
8513 file_offset, skip_sum,
8514 async_submit);
8515 if (ret) {
8516 bio_put(bio);
8517 atomic_dec(&dip->pending_bios);
8518 goto out_err;
8519 }
8520
8521 start_sector += submit_len >> 9;
8522 file_offset += submit_len;
8523 8533
8524 submit_len = 0; 8534 /*
8535 * Increase the count before we submit the bio so we know
8536 * the end IO handler won't happen before we increase the
8537 * count. Otherwise, the dip might get freed before we're
8538 * done setting it up.
8539 */
8540 atomic_inc(&dip->pending_bios);
8525 8541
8526 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, 8542 ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
8527 start_sector, GFP_NOFS); 8543 async_submit);
8528 if (!bio) 8544 if (ret) {
8529 goto out_err; 8545 bio_put(bio);
8530 bio->bi_opf = orig_bio->bi_opf; 8546 atomic_dec(&dip->pending_bios);
8531 bio->bi_private = dip; 8547 goto out_err;
8532 bio->bi_end_io = btrfs_end_dio_bio; 8548 }
8533 btrfs_io_bio(bio)->logical = file_offset;
8534 8549
8535 map_length = orig_bio->bi_iter.bi_size; 8550 clone_offset += clone_len;
8536 ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), 8551 start_sector += clone_len >> 9;
8537 start_sector << 9, 8552 file_offset += clone_len;
8538 &map_length, NULL, 0);
8539 if (ret) {
8540 bio_put(bio);
8541 goto out_err;
8542 }
8543 8553
8544 goto next_block; 8554 map_length = submit_len;
8545 } else { 8555 ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
8546 submit_len += blocksize; 8556 start_sector << 9, &map_length, NULL, 0);
8547 if (--nr_sectors) { 8557 if (ret)
8548 i++; 8558 goto out_err;
8549 goto next_block; 8559 } while (submit_len > 0);
8550 }
8551 }
8552 }
8553 8560
8554submit: 8561submit:
8555 ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, 8562 ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
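
The split loop in btrfs_submit_direct_hook() is rebuilt around partial cloning: rather than allocating an empty bio and re-adding pages one block at a time (retrying btrfs_map_block() whenever a page crossed a stripe boundary), it asks btrfs_map_block() how far the current mapping extends, clones exactly that byte range of the original bio with btrfs_bio_clone_partial(orig_bio, clone_offset, clone_len), submits it, and advances clone_offset, start_sector and file_offset by clone_len until submit_len is exhausted. The pending_bios ordering is preserved: the counter is incremented before each submit so the dip cannot be freed while the next clone is still being set up. Cloning also keeps the original bio's pages referenced instead of rebuilding the bvec list, which is both simpler and cheaper.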
@@ -8576,19 +8583,15 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
8576 loff_t file_offset) 8583 loff_t file_offset)
8577{ 8584{
8578 struct btrfs_dio_private *dip = NULL; 8585 struct btrfs_dio_private *dip = NULL;
8579 struct bio *io_bio = NULL; 8586 struct bio *bio = NULL;
8580 struct btrfs_io_bio *btrfs_bio; 8587 struct btrfs_io_bio *io_bio;
8581 int skip_sum; 8588 int skip_sum;
8582 bool write = (bio_op(dio_bio) == REQ_OP_WRITE); 8589 bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
8583 int ret = 0; 8590 int ret = 0;
8584 8591
8585 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 8592 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
8586 8593
8587 io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); 8594 bio = btrfs_bio_clone(dio_bio);
8588 if (!io_bio) {
8589 ret = -ENOMEM;
8590 goto free_ordered;
8591 }
8592 8595
8593 dip = kzalloc(sizeof(*dip), GFP_NOFS); 8596 dip = kzalloc(sizeof(*dip), GFP_NOFS);
8594 if (!dip) { 8597 if (!dip) {
@@ -8601,17 +8604,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
8601 dip->logical_offset = file_offset; 8604 dip->logical_offset = file_offset;
8602 dip->bytes = dio_bio->bi_iter.bi_size; 8605 dip->bytes = dio_bio->bi_iter.bi_size;
8603 dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; 8606 dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
8604 io_bio->bi_private = dip; 8607 bio->bi_private = dip;
8605 dip->orig_bio = io_bio; 8608 dip->orig_bio = bio;
8606 dip->dio_bio = dio_bio; 8609 dip->dio_bio = dio_bio;
8607 atomic_set(&dip->pending_bios, 0); 8610 atomic_set(&dip->pending_bios, 0);
8608 btrfs_bio = btrfs_io_bio(io_bio); 8611 io_bio = btrfs_io_bio(bio);
8609 btrfs_bio->logical = file_offset; 8612 io_bio->logical = file_offset;
8610 8613
8611 if (write) { 8614 if (write) {
8612 io_bio->bi_end_io = btrfs_endio_direct_write; 8615 bio->bi_end_io = btrfs_endio_direct_write;
8613 } else { 8616 } else {
8614 io_bio->bi_end_io = btrfs_endio_direct_read; 8617 bio->bi_end_io = btrfs_endio_direct_read;
8615 dip->subio_endio = btrfs_subio_endio_read; 8618 dip->subio_endio = btrfs_subio_endio_read;
8616 } 8619 }
8617 8620
@@ -8634,8 +8637,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
8634 if (!ret) 8637 if (!ret)
8635 return; 8638 return;
8636 8639
8637 if (btrfs_bio->end_io) 8640 if (io_bio->end_io)
8638 btrfs_bio->end_io(btrfs_bio, ret); 8641 io_bio->end_io(io_bio, ret);
8639 8642
8640free_ordered: 8643free_ordered:
8641 /* 8644 /*
@@ -8647,16 +8650,15 @@ free_ordered:
8647 * same as btrfs_endio_direct_[write|read] because we can't call these 8650 * same as btrfs_endio_direct_[write|read] because we can't call these
8648 * callbacks - they require an allocated dip and a clone of dio_bio. 8651 * callbacks - they require an allocated dip and a clone of dio_bio.
8649 */ 8652 */
8650 if (io_bio && dip) { 8653 if (bio && dip) {
8651 io_bio->bi_status = BLK_STS_IOERR; 8654 bio_io_error(bio);
8652 bio_endio(io_bio);
8653 /* 8655 /*
8654 * The end io callbacks free our dip, do the final put on io_bio 8656 * The end io callbacks free our dip, do the final put on bio
8655 * and all the cleanup and final put for dio_bio (through 8657 * and all the cleanup and final put for dio_bio (through
8656 * dio_end_io()). 8658 * dio_end_io()).
8657 */ 8659 */
8658 dip = NULL; 8660 dip = NULL;
8659 io_bio = NULL; 8661 bio = NULL;
8660 } else { 8662 } else {
8661 if (write) 8663 if (write)
8662 __endio_write_update_ordered(inode, 8664 __endio_write_update_ordered(inode,
@@ -8674,8 +8676,8 @@ free_ordered:
8674 */ 8676 */
8675 dio_end_io(dio_bio); 8677 dio_end_io(dio_bio);
8676 } 8678 }
8677 if (io_bio) 8679 if (bio)
8678 bio_put(io_bio); 8680 bio_put(bio);
8679 kfree(dip); 8681 kfree(dip);
8680} 8682}
8681 8683
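
The renaming in btrfs_submit_direct() (the struct bio is now bio, and io_bio is the btrfs_io_bio wrapper) tracks the convention used elsewhere, and btrfs_bio_clone() no longer takes gfp flags or returns NULL: the allocation is backed by the btrfs_bioset mempool, so the -ENOMEM unwind disappears. The error path likewise collapses to bio_io_error(), which sets BLK_STS_IOERR and calls bio_endio() in one step.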
@@ -8719,6 +8721,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8719 struct inode *inode = file->f_mapping->host; 8721 struct inode *inode = file->f_mapping->host;
8720 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 8722 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8721 struct btrfs_dio_data dio_data = { 0 }; 8723 struct btrfs_dio_data dio_data = { 0 };
8724 struct extent_changeset *data_reserved = NULL;
8722 loff_t offset = iocb->ki_pos; 8725 loff_t offset = iocb->ki_pos;
8723 size_t count = 0; 8726 size_t count = 0;
8724 int flags = 0; 8727 int flags = 0;
@@ -8758,7 +8761,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8758 ret = -EAGAIN; 8761 ret = -EAGAIN;
8759 goto out; 8762 goto out;
8760 } 8763 }
8761 ret = btrfs_delalloc_reserve_space(inode, offset, count); 8764 ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
8765 offset, count);
8762 if (ret) 8766 if (ret)
8763 goto out; 8767 goto out;
8764 dio_data.outstanding_extents = count_max_extents(count); 8768 dio_data.outstanding_extents = count_max_extents(count);
@@ -8790,8 +8794,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8790 current->journal_info = NULL; 8794 current->journal_info = NULL;
8791 if (ret < 0 && ret != -EIOCBQUEUED) { 8795 if (ret < 0 && ret != -EIOCBQUEUED) {
8792 if (dio_data.reserve) 8796 if (dio_data.reserve)
8793 btrfs_delalloc_release_space(inode, offset, 8797 btrfs_delalloc_release_space(inode, data_reserved,
8794 dio_data.reserve); 8798 offset, dio_data.reserve);
8795 /* 8799 /*
8796 * On error we might have left some ordered extents 8800 * On error we might have left some ordered extents
8797 * without submitting corresponding bios for them, so 8801 * without submitting corresponding bios for them, so
@@ -8806,8 +8810,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8806 dio_data.unsubmitted_oe_range_start, 8810 dio_data.unsubmitted_oe_range_start,
8807 false); 8811 false);
8808 } else if (ret >= 0 && (size_t)ret < count) 8812 } else if (ret >= 0 && (size_t)ret < count)
8809 btrfs_delalloc_release_space(inode, offset, 8813 btrfs_delalloc_release_space(inode, data_reserved,
8810 count - (size_t)ret); 8814 offset, count - (size_t)ret);
8811 } 8815 }
8812out: 8816out:
8813 if (wakeup) 8817 if (wakeup)
@@ -8815,6 +8819,7 @@ out:
8815 if (relock) 8819 if (relock)
8816 inode_lock(inode); 8820 inode_lock(inode);
8817 8821
8822 extent_changeset_free(data_reserved);
8818 return ret; 8823 return ret;
8819} 8824}
8820 8825
@@ -9005,7 +9010,7 @@ again:
9005 * free the entire extent. 9010 * free the entire extent.
9006 */ 9011 */
9007 if (PageDirty(page)) 9012 if (PageDirty(page))
9008 btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE); 9013 btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
9009 if (!inode_evicting) { 9014 if (!inode_evicting) {
9010 clear_extent_bit(tree, page_start, page_end, 9015 clear_extent_bit(tree, page_start, page_end,
9011 EXTENT_LOCKED | EXTENT_DIRTY | 9016 EXTENT_LOCKED | EXTENT_DIRTY |
@@ -9047,6 +9052,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
9047 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 9052 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
9048 struct btrfs_ordered_extent *ordered; 9053 struct btrfs_ordered_extent *ordered;
9049 struct extent_state *cached_state = NULL; 9054 struct extent_state *cached_state = NULL;
9055 struct extent_changeset *data_reserved = NULL;
9050 char *kaddr; 9056 char *kaddr;
9051 unsigned long zero_start; 9057 unsigned long zero_start;
9052 loff_t size; 9058 loff_t size;
@@ -9072,7 +9078,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
9072 * end up waiting indefinitely to get a lock on the page currently 9078 * end up waiting indefinitely to get a lock on the page currently
9073 * being processed by btrfs_page_mkwrite() function. 9079 * being processed by btrfs_page_mkwrite() function.
9074 */ 9080 */
9075 ret = btrfs_delalloc_reserve_space(inode, page_start, 9081 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
9076 reserved_space); 9082 reserved_space);
9077 if (!ret) { 9083 if (!ret) {
9078 ret = file_update_time(vmf->vma->vm_file); 9084 ret = file_update_time(vmf->vma->vm_file);
@@ -9126,8 +9132,8 @@ again:
9126 spin_lock(&BTRFS_I(inode)->lock); 9132 spin_lock(&BTRFS_I(inode)->lock);
9127 BTRFS_I(inode)->outstanding_extents++; 9133 BTRFS_I(inode)->outstanding_extents++;
9128 spin_unlock(&BTRFS_I(inode)->lock); 9134 spin_unlock(&BTRFS_I(inode)->lock);
9129 btrfs_delalloc_release_space(inode, page_start, 9135 btrfs_delalloc_release_space(inode, data_reserved,
9130 PAGE_SIZE - reserved_space); 9136 page_start, PAGE_SIZE - reserved_space);
9131 } 9137 }
9132 } 9138 }
9133 9139
@@ -9178,13 +9184,16 @@ again:
9178out_unlock: 9184out_unlock:
9179 if (!ret) { 9185 if (!ret) {
9180 sb_end_pagefault(inode->i_sb); 9186 sb_end_pagefault(inode->i_sb);
9187 extent_changeset_free(data_reserved);
9181 return VM_FAULT_LOCKED; 9188 return VM_FAULT_LOCKED;
9182 } 9189 }
9183 unlock_page(page); 9190 unlock_page(page);
9184out: 9191out:
9185 btrfs_delalloc_release_space(inode, page_start, reserved_space); 9192 btrfs_delalloc_release_space(inode, data_reserved, page_start,
9193 reserved_space);
9186out_noreserve: 9194out_noreserve:
9187 sb_end_pagefault(inode->i_sb); 9195 sb_end_pagefault(inode->i_sb);
9196 extent_changeset_free(data_reserved);
9188 return ret; 9197 return ret;
9189} 9198}
9190 9199
@@ -9406,8 +9415,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
9406 9415
9407 inode = &ei->vfs_inode; 9416 inode = &ei->vfs_inode;
9408 extent_map_tree_init(&ei->extent_tree); 9417 extent_map_tree_init(&ei->extent_tree);
9409 extent_io_tree_init(&ei->io_tree, &inode->i_data); 9418 extent_io_tree_init(&ei->io_tree, inode);
9410 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 9419 extent_io_tree_init(&ei->io_failure_tree, inode);
9411 ei->io_tree.track_uptodate = 1; 9420 ei->io_tree.track_uptodate = 1;
9412 ei->io_failure_tree.track_uptodate = 1; 9421 ei->io_failure_tree.track_uptodate = 1;
9413 atomic_set(&ei->sync_writers, 0); 9422 atomic_set(&ei->sync_writers, 0);
@@ -9516,7 +9525,6 @@ void btrfs_destroy_cachep(void)
9516 rcu_barrier(); 9525 rcu_barrier();
9517 kmem_cache_destroy(btrfs_inode_cachep); 9526 kmem_cache_destroy(btrfs_inode_cachep);
9518 kmem_cache_destroy(btrfs_trans_handle_cachep); 9527 kmem_cache_destroy(btrfs_trans_handle_cachep);
9519 kmem_cache_destroy(btrfs_transaction_cachep);
9520 kmem_cache_destroy(btrfs_path_cachep); 9528 kmem_cache_destroy(btrfs_path_cachep);
9521 kmem_cache_destroy(btrfs_free_space_cachep); 9529 kmem_cache_destroy(btrfs_free_space_cachep);
9522} 9530}
@@ -9536,12 +9544,6 @@ int btrfs_init_cachep(void)
9536 if (!btrfs_trans_handle_cachep) 9544 if (!btrfs_trans_handle_cachep)
9537 goto fail; 9545 goto fail;
9538 9546
9539 btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
9540 sizeof(struct btrfs_transaction), 0,
9541 SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
9542 if (!btrfs_transaction_cachep)
9543 goto fail;
9544
9545 btrfs_path_cachep = kmem_cache_create("btrfs_path", 9547 btrfs_path_cachep = kmem_cache_create("btrfs_path",
9546 sizeof(struct btrfs_path), 0, 9548 sizeof(struct btrfs_path), 0,
9547 SLAB_MEM_SPREAD, NULL); 9549 SLAB_MEM_SPREAD, NULL);
@@ -9566,6 +9568,24 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat,
9566 u64 delalloc_bytes; 9568 u64 delalloc_bytes;
9567 struct inode *inode = d_inode(path->dentry); 9569 struct inode *inode = d_inode(path->dentry);
9568 u32 blocksize = inode->i_sb->s_blocksize; 9570 u32 blocksize = inode->i_sb->s_blocksize;
9571 u32 bi_flags = BTRFS_I(inode)->flags;
9572
9573 stat->result_mask |= STATX_BTIME;
9574 stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
9575 stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
9576 if (bi_flags & BTRFS_INODE_APPEND)
9577 stat->attributes |= STATX_ATTR_APPEND;
9578 if (bi_flags & BTRFS_INODE_COMPRESS)
9579 stat->attributes |= STATX_ATTR_COMPRESSED;
9580 if (bi_flags & BTRFS_INODE_IMMUTABLE)
9581 stat->attributes |= STATX_ATTR_IMMUTABLE;
9582 if (bi_flags & BTRFS_INODE_NODUMP)
9583 stat->attributes |= STATX_ATTR_NODUMP;
9584
9585 stat->attributes_mask |= (STATX_ATTR_APPEND |
9586 STATX_ATTR_COMPRESSED |
9587 STATX_ATTR_IMMUTABLE |
9588 STATX_ATTR_NODUMP);
9569 9589
9570 generic_fillattr(inode, stat); 9590 generic_fillattr(inode, stat);
9571 stat->dev = BTRFS_I(inode)->root->anon_dev; 9591 stat->dev = BTRFS_I(inode)->root->anon_dev;
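
This hunk is the statx support: the birth time comes from the btrfs-specific i_otime, and the append/compress/immutable/nodump inode flags are translated to STATX_ATTR_* bits, with attributes_mask advertising which attributes btrfs can report at all. A hedged userspace sketch of reading the new fields (assumes statx-capable kernel headers; the raw syscall is used in case libc has no statx() wrapper yet):

	#include <linux/stat.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <stdio.h>

	int main(void)
	{
		struct statx stx;

		if (syscall(SYS_statx, AT_FDCWD, "somefile", 0,
			    STATX_BTIME, &stx) == 0 &&
		    (stx.stx_mask & STATX_BTIME))
			printf("btime: %lld.%09u\n",
			       (long long)stx.stx_btime.tv_sec,
			       stx.stx_btime.tv_nsec);
		return 0;
	}

Note that STATX_BTIME is only set in stx_mask when the filesystem actually filled the field in, which is why the mask is checked before use.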
@@ -10540,7 +10560,7 @@ next:
10540 btrfs_end_transaction(trans); 10560 btrfs_end_transaction(trans);
10541 } 10561 }
10542 if (cur_offset < end) 10562 if (cur_offset < end)
10543 btrfs_free_reserved_data_space(inode, cur_offset, 10563 btrfs_free_reserved_data_space(inode, NULL, cur_offset,
10544 end - cur_offset + 1); 10564 end - cur_offset + 1);
10545 return ret; 10565 return ret;
10546} 10566}
@@ -10661,6 +10681,42 @@ static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror)
10661 return -EAGAIN; 10681 return -EAGAIN;
10662} 10682}
10663 10683
10684static struct btrfs_fs_info *iotree_fs_info(void *private_data)
10685{
10686 struct inode *inode = private_data;
10687 return btrfs_sb(inode->i_sb);
10688}
10689
10690static void btrfs_check_extent_io_range(void *private_data, const char *caller,
10691 u64 start, u64 end)
10692{
10693 struct inode *inode = private_data;
10694 u64 isize;
10695
10696 isize = i_size_read(inode);
10697 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
10698 btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
10699 "%s: ino %llu isize %llu odd range [%llu,%llu]",
10700 caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
10701 }
10702}
10703
10704void btrfs_set_range_writeback(void *private_data, u64 start, u64 end)
10705{
10706 struct inode *inode = private_data;
10707 unsigned long index = start >> PAGE_SHIFT;
10708 unsigned long end_index = end >> PAGE_SHIFT;
10709 struct page *page;
10710
10711 while (index <= end_index) {
10712 page = find_get_page(inode->i_mapping, index);
10713 ASSERT(page); /* Pages should be in the extent_io_tree */
10714 set_page_writeback(page);
10715 put_page(page);
10716 index++;
10717 }
10718}
10719
10664static const struct inode_operations btrfs_dir_inode_operations = { 10720static const struct inode_operations btrfs_dir_inode_operations = {
10665 .getattr = btrfs_getattr, 10721 .getattr = btrfs_getattr,
10666 .lookup = btrfs_lookup, 10722 .lookup = btrfs_lookup,
@@ -10704,6 +10760,8 @@ static const struct extent_io_ops btrfs_extent_io_ops = {
10704 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 10760 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
10705 .merge_bio_hook = btrfs_merge_bio_hook, 10761 .merge_bio_hook = btrfs_merge_bio_hook,
10706 .readpage_io_failed_hook = btrfs_readpage_io_failed_hook, 10762 .readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
10763 .tree_fs_info = iotree_fs_info,
10764 .set_range_writeback = btrfs_set_range_writeback,
10707 10765
10708 /* optional callbacks */ 10766 /* optional callbacks */
10709 .fill_delalloc = run_delalloc_range, 10767 .fill_delalloc = run_delalloc_range,
@@ -10713,6 +10771,7 @@ static const struct extent_io_ops btrfs_extent_io_ops = {
10713 .clear_bit_hook = btrfs_clear_bit_hook, 10771 .clear_bit_hook = btrfs_clear_bit_hook,
10714 .merge_extent_hook = btrfs_merge_extent_hook, 10772 .merge_extent_hook = btrfs_merge_extent_hook,
10715 .split_extent_hook = btrfs_split_extent_hook, 10773 .split_extent_hook = btrfs_split_extent_hook,
10774 .check_extent_io_range = btrfs_check_extent_io_range,
10716}; 10775};
10717 10776
10718/* 10777/*
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e176375f374f..fa1b78cf25f6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -37,7 +37,7 @@
37#include <linux/bit_spinlock.h> 37#include <linux/bit_spinlock.h>
38#include <linux/security.h> 38#include <linux/security.h>
39#include <linux/xattr.h> 39#include <linux/xattr.h>
40#include <linux/vmalloc.h> 40#include <linux/mm.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/blkdev.h> 42#include <linux/blkdev.h>
43#include <linux/uuid.h> 43#include <linux/uuid.h>
@@ -689,7 +689,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
689 if (ret) 689 if (ret)
690 goto dec_and_free; 690 goto dec_and_free;
691 691
692 btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); 692 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
693 693
694 btrfs_init_block_rsv(&pending_snapshot->block_rsv, 694 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
695 BTRFS_BLOCK_RSV_TEMP); 695 BTRFS_BLOCK_RSV_TEMP);
@@ -1127,6 +1127,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
1127 struct btrfs_ordered_extent *ordered; 1127 struct btrfs_ordered_extent *ordered;
1128 struct extent_state *cached_state = NULL; 1128 struct extent_state *cached_state = NULL;
1129 struct extent_io_tree *tree; 1129 struct extent_io_tree *tree;
1130 struct extent_changeset *data_reserved = NULL;
1130 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 1131 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1131 1132
1132 file_end = (isize - 1) >> PAGE_SHIFT; 1133 file_end = (isize - 1) >> PAGE_SHIFT;
@@ -1135,7 +1136,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
1135 1136
1136 page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); 1137 page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
1137 1138
1138 ret = btrfs_delalloc_reserve_space(inode, 1139 ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
1139 start_index << PAGE_SHIFT, 1140 start_index << PAGE_SHIFT,
1140 page_cnt << PAGE_SHIFT); 1141 page_cnt << PAGE_SHIFT);
1141 if (ret) 1142 if (ret)
@@ -1226,7 +1227,7 @@ again:
1226 spin_lock(&BTRFS_I(inode)->lock); 1227 spin_lock(&BTRFS_I(inode)->lock);
1227 BTRFS_I(inode)->outstanding_extents++; 1228 BTRFS_I(inode)->outstanding_extents++;
1228 spin_unlock(&BTRFS_I(inode)->lock); 1229 spin_unlock(&BTRFS_I(inode)->lock);
1229 btrfs_delalloc_release_space(inode, 1230 btrfs_delalloc_release_space(inode, data_reserved,
1230 start_index << PAGE_SHIFT, 1231 start_index << PAGE_SHIFT,
1231 (page_cnt - i_done) << PAGE_SHIFT); 1232 (page_cnt - i_done) << PAGE_SHIFT);
1232 } 1233 }
@@ -1247,15 +1248,17 @@ again:
1247 unlock_page(pages[i]); 1248 unlock_page(pages[i]);
1248 put_page(pages[i]); 1249 put_page(pages[i]);
1249 } 1250 }
1251 extent_changeset_free(data_reserved);
1250 return i_done; 1252 return i_done;
1251out: 1253out:
1252 for (i = 0; i < i_done; i++) { 1254 for (i = 0; i < i_done; i++) {
1253 unlock_page(pages[i]); 1255 unlock_page(pages[i]);
1254 put_page(pages[i]); 1256 put_page(pages[i]);
1255 } 1257 }
1256 btrfs_delalloc_release_space(inode, 1258 btrfs_delalloc_release_space(inode, data_reserved,
1257 start_index << PAGE_SHIFT, 1259 start_index << PAGE_SHIFT,
1258 page_cnt << PAGE_SHIFT); 1260 page_cnt << PAGE_SHIFT);
1261 extent_changeset_free(data_reserved);
1259 return ret; 1262 return ret;
1260 1263
1261} 1264}
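The cluster_pages_for_defrag() hunks show the reservation calling convention this series introduces: the caller owns a NULL-initialized extent_changeset pointer, btrfs_delalloc_reserve_space() records every range it newly reserves into it, the same changeset is handed back on the release path, and extent_changeset_free() runs exactly once on every exit. A hedged usage sketch; touch_pages() is a hypothetical stand-in, not a btrfs function:

        static int do_reserved_io(struct inode *inode, u64 start, u64 len)
        {
                struct extent_changeset *data_reserved = NULL;
                int ret;

                /* records each newly reserved range in data_reserved */
                ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
                                                   start, len);
                if (ret)
                        return ret;

                ret = touch_pages(inode, start, len); /* hypothetical step */
                if (ret < 0)
                        /* frees only what this changeset really reserved */
                        btrfs_delalloc_release_space(inode, data_reserved,
                                                     start, len);

                extent_changeset_free(data_reserved); /* every exit path */
                return ret;
        }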
@@ -4588,7 +4591,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
4588 4591
4589out: 4592out:
4590 btrfs_free_path(path); 4593 btrfs_free_path(path);
4591 vfree(inodes); 4594 kvfree(inodes);
4592 kfree(loi); 4595 kfree(loi);
4593 4596
4594 return ret; 4597 return ret;
@@ -4897,7 +4900,6 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
4897 goto out; 4900 goto out;
4898 } 4901 }
4899 4902
4900 /* FIXME: check if the IDs really exist */
4901 if (sa->assign) { 4903 if (sa->assign) {
4902 ret = btrfs_add_qgroup_relation(trans, fs_info, 4904 ret = btrfs_add_qgroup_relation(trans, fs_info,
4903 sa->src, sa->dst); 4905 sa->src, sa->dst);
@@ -4956,7 +4958,6 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
4956 goto out; 4958 goto out;
4957 } 4959 }
4958 4960
4959 /* FIXME: check if the IDs really exist */
4960 if (sa->create) { 4961 if (sa->create) {
4961 ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid); 4962 ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid);
4962 } else { 4963 } else {
@@ -5010,7 +5011,6 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
5010 qgroupid = root->root_key.objectid; 5011 qgroupid = root->root_key.objectid;
5011 } 5012 }
5012 5013
5013 /* FIXME: check if the IDs really exist */
5014 ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &sa->lim); 5014 ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &sa->lim);
5015 5015
5016 err = btrfs_end_transaction(trans); 5016 err = btrfs_end_transaction(trans);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index f48c8c14dc14..d433e75d489a 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -18,13 +18,14 @@
18 18
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/vmalloc.h> 21#include <linux/mm.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/err.h> 23#include <linux/err.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/pagemap.h> 25#include <linux/pagemap.h>
26#include <linux/bio.h> 26#include <linux/bio.h>
27#include <linux/lzo.h> 27#include <linux/lzo.h>
28#include <linux/refcount.h>
28#include "compression.h" 29#include "compression.h"
29 30
30#define LZO_LEN 4 31#define LZO_LEN 4
@@ -40,9 +41,9 @@ static void lzo_free_workspace(struct list_head *ws)
40{ 41{
41 struct workspace *workspace = list_entry(ws, struct workspace, list); 42 struct workspace *workspace = list_entry(ws, struct workspace, list);
42 43
43 vfree(workspace->buf); 44 kvfree(workspace->buf);
44 vfree(workspace->cbuf); 45 kvfree(workspace->cbuf);
45 vfree(workspace->mem); 46 kvfree(workspace->mem);
46 kfree(workspace); 47 kfree(workspace);
47} 48}
48 49
@@ -50,13 +51,13 @@ static struct list_head *lzo_alloc_workspace(void)
50{ 51{
51 struct workspace *workspace; 52 struct workspace *workspace;
52 53
53 workspace = kzalloc(sizeof(*workspace), GFP_NOFS); 54 workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
54 if (!workspace) 55 if (!workspace)
55 return ERR_PTR(-ENOMEM); 56 return ERR_PTR(-ENOMEM);
56 57
57 workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); 58 workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
58 workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); 59 workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
59 workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_SIZE)); 60 workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
60 if (!workspace->mem || !workspace->buf || !workspace->cbuf) 61 if (!workspace->mem || !workspace->buf || !workspace->cbuf)
61 goto fail; 62 goto fail;
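Both lzo workspace hunks are one conversion: vmalloc()/vfree() become kvmalloc(..., GFP_KERNEL)/kvfree(). kvmalloc() tries the kmalloc fast path and transparently falls back to vmalloc() for large or fragmented requests, and kvfree() accepts either kind of pointer, so teardown code never needs to know which path won. The pattern in isolation:

        /* kmalloc fast path, transparent vmalloc fallback */
        buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        /* ... use buf ... */

        kvfree(buf); /* correct for kmalloc- and vmalloc-backed memory */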
62 63
@@ -141,7 +142,7 @@ static int lzo_compress_pages(struct list_head *ws,
141 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, 142 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
142 &out_len, workspace->mem); 143 &out_len, workspace->mem);
143 if (ret != LZO_E_OK) { 144 if (ret != LZO_E_OK) {
144 pr_debug("BTRFS: deflate in loop returned %d\n", 145 pr_debug("BTRFS: lzo in loop returned %d\n",
145 ret); 146 ret);
146 ret = -EIO; 147 ret = -EIO;
147 goto out; 148 goto out;
@@ -229,8 +230,10 @@ static int lzo_compress_pages(struct list_head *ws,
229 in_len = min(bytes_left, PAGE_SIZE); 230 in_len = min(bytes_left, PAGE_SIZE);
230 } 231 }
231 232
232 if (tot_out > tot_in) 233 if (tot_out >= tot_in) {
234 ret = -E2BIG;
233 goto out; 235 goto out;
236 }
234 237
235 /* store the size of all chunks of compressed data */ 238 /* store the size of all chunks of compressed data */
236 cpage_out = kmap(pages[0]); 239 cpage_out = kmap(pages[0]);
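The hunk above turns "output not strictly smaller than input" from a silent goto into a hard -E2BIG, which the compression layer takes as the cue to store the extent uncompressed; equal size gains nothing over storing the data raw, so it is rejected too. The check in isolation, as an illustrative helper rather than a btrfs function:

        static int worth_storing_compressed(unsigned long tot_in,
                                            unsigned long tot_out)
        {
                /* require a strict size reduction, headers included */
                if (tot_out >= tot_in)
                        return -E2BIG;
                return 0;
        }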
@@ -254,16 +257,13 @@ out:
254 return ret; 257 return ret;
255} 258}
256 259
257static int lzo_decompress_bio(struct list_head *ws, 260static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
258 struct page **pages_in,
259 u64 disk_start,
260 struct bio *orig_bio,
261 size_t srclen)
262{ 261{
263 struct workspace *workspace = list_entry(ws, struct workspace, list); 262 struct workspace *workspace = list_entry(ws, struct workspace, list);
264 int ret = 0, ret2; 263 int ret = 0, ret2;
265 char *data_in; 264 char *data_in;
266 unsigned long page_in_index = 0; 265 unsigned long page_in_index = 0;
266 size_t srclen = cb->compressed_len;
267 unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); 267 unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
268 unsigned long buf_start; 268 unsigned long buf_start;
269 unsigned long buf_offset = 0; 269 unsigned long buf_offset = 0;
@@ -278,6 +278,9 @@ static int lzo_decompress_bio(struct list_head *ws,
278 unsigned long tot_len; 278 unsigned long tot_len;
279 char *buf; 279 char *buf;
280 bool may_late_unmap, need_unmap; 280 bool may_late_unmap, need_unmap;
281 struct page **pages_in = cb->compressed_pages;
282 u64 disk_start = cb->start;
283 struct bio *orig_bio = cb->orig_bio;
281 284
282 data_in = kmap(pages_in[0]); 285 data_in = kmap(pages_in[0]);
283 tot_len = read_compress_length(data_in); 286 tot_len = read_compress_length(data_in);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7b40e2e7292a..a3aca495e33e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -663,7 +663,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
663 * wait for all the ordered extents in a root. This is done when balancing 663 * wait for all the ordered extents in a root. This is done when balancing
664 * space between drives. 664 * space between drives.
665 */ 665 */
666int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, 666u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
667 const u64 range_start, const u64 range_len) 667 const u64 range_start, const u64 range_len)
668{ 668{
669 struct btrfs_fs_info *fs_info = root->fs_info; 669 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -671,7 +671,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
671 LIST_HEAD(skipped); 671 LIST_HEAD(skipped);
672 LIST_HEAD(works); 672 LIST_HEAD(works);
673 struct btrfs_ordered_extent *ordered, *next; 673 struct btrfs_ordered_extent *ordered, *next;
674 int count = 0; 674 u64 count = 0;
675 const u64 range_end = range_start + range_len; 675 const u64 range_end = range_start + range_len;
676 676
677 mutex_lock(&root->ordered_extent_mutex); 677 mutex_lock(&root->ordered_extent_mutex);
@@ -701,7 +701,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
701 701
702 cond_resched(); 702 cond_resched();
703 spin_lock(&root->ordered_extent_lock); 703 spin_lock(&root->ordered_extent_lock);
704 if (nr != -1) 704 if (nr != U64_MAX)
705 nr--; 705 nr--;
706 count++; 706 count++;
707 } 707 }
@@ -720,13 +720,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
720 return count; 720 return count;
721} 721}
722 722
723int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, 723u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
724 const u64 range_start, const u64 range_len) 724 const u64 range_start, const u64 range_len)
725{ 725{
726 struct btrfs_root *root; 726 struct btrfs_root *root;
727 struct list_head splice; 727 struct list_head splice;
728 int done; 728 u64 total_done = 0;
729 int total_done = 0; 729 u64 done;
730 730
731 INIT_LIST_HEAD(&splice); 731 INIT_LIST_HEAD(&splice);
732 732
@@ -748,9 +748,8 @@ int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
748 total_done += done; 748 total_done += done;
749 749
750 spin_lock(&fs_info->ordered_root_lock); 750 spin_lock(&fs_info->ordered_root_lock);
751 if (nr != -1) { 751 if (nr != U64_MAX) {
752 nr -= done; 752 nr -= done;
753 WARN_ON(nr < 0);
754 } 753 }
755 } 754 }
756 list_splice_tail(&splice, &fs_info->ordered_roots); 755 list_splice_tail(&splice, &fs_info->ordered_roots);
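Both ordered-extent waiters switch nr and their return type from int to u64: the old -1 "no limit" convention becomes an explicit U64_MAX sentinel, the returned count can no longer wrap an int on huge filesystems, and WARN_ON(nr < 0), meaningless for unsigned arithmetic, is dropped. Shape of the counting loop, with have_more()/wait_one() as hypothetical stand-ins:

        u64 nr = U64_MAX;  /* U64_MAX == wait for everything */
        u64 count = 0;

        while (have_more() && nr) {
                wait_one();
                if (nr != U64_MAX)
                        nr--;
                count++;   /* u64, so totals above INT_MAX are fine */
        }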
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e0c1d5b8d859..56c4c0ee6381 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -200,9 +200,9 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
200 struct btrfs_ordered_extent *ordered); 200 struct btrfs_ordered_extent *ordered);
201int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, 201int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
202 u32 *sum, int len); 202 u32 *sum, int len);
203int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, 203u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
204 const u64 range_start, const u64 range_len); 204 const u64 range_start, const u64 range_len);
205int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, 205u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
206 const u64 range_start, const u64 range_len); 206 const u64 range_start, const u64 range_len);
207void btrfs_get_logged_extents(struct btrfs_inode *inode, 207void btrfs_get_logged_extents(struct btrfs_inode *inode,
208 struct list_head *logged_list, 208 struct list_head *logged_list,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index cdafbf92ef0c..fcae61e175f3 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -261,8 +261,11 @@ void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l)
261 case BTRFS_BLOCK_GROUP_ITEM_KEY: 261 case BTRFS_BLOCK_GROUP_ITEM_KEY:
262 bi = btrfs_item_ptr(l, i, 262 bi = btrfs_item_ptr(l, i,
263 struct btrfs_block_group_item); 263 struct btrfs_block_group_item);
264 pr_info("\t\tblock group used %llu\n", 264 pr_info(
265 btrfs_disk_block_group_used(l, bi)); 265 "\t\tblock group used %llu chunk_objectid %llu flags %llu\n",
266 btrfs_disk_block_group_used(l, bi),
267 btrfs_disk_block_group_chunk_objectid(l, bi),
268 btrfs_disk_block_group_flags(l, bi));
266 break; 269 break;
267 case BTRFS_CHUNK_ITEM_KEY: 270 case BTRFS_CHUNK_ITEM_KEY:
268 print_chunk(l, btrfs_item_ptr(l, i, 271 print_chunk(l, btrfs_item_ptr(l, i,
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index d6cb155ef7a1..4b23ae5d0e5c 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -164,6 +164,7 @@ static int iterate_object_props(struct btrfs_root *root,
164 size_t), 164 size_t),
165 void *ctx) 165 void *ctx)
166{ 166{
167 struct btrfs_fs_info *fs_info = root->fs_info;
167 int ret; 168 int ret;
168 char *name_buf = NULL; 169 char *name_buf = NULL;
169 char *value_buf = NULL; 170 char *value_buf = NULL;
@@ -214,6 +215,12 @@ static int iterate_object_props(struct btrfs_root *root,
214 name_ptr = (unsigned long)(di + 1); 215 name_ptr = (unsigned long)(di + 1);
215 data_ptr = name_ptr + name_len; 216 data_ptr = name_ptr + name_len;
216 217
218 if (verify_dir_item(fs_info, leaf,
219 path->slots[0], di)) {
220 ret = -EIO;
221 goto out;
222 }
223
217 if (name_len <= XATTR_BTRFS_PREFIX_LEN || 224 if (name_len <= XATTR_BTRFS_PREFIX_LEN ||
218 memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, 225 memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX,
219 name_ptr, 226 name_ptr,
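iterate_object_props() gets the same dir-item hardening as the other validation patches in this pull: verify_dir_item() runs before any length derived from the on-disk item is trusted, since a crafted image could point name_len or data_ptr past the leaf. The guard pattern, sketched with the helpers visible above (slot standing in for path->slots[0]):

        di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
        if (verify_dir_item(fs_info, leaf, slot, di)) {
                ret = -EIO;     /* corrupted item, stop parsing */
                goto out;
        }
        name_len = btrfs_dir_name_len(leaf, di); /* now safe to use */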
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index deffbeb74a0b..4ce351efe281 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1406,38 +1406,6 @@ out:
1406 return ret; 1406 return ret;
1407} 1407}
1408 1408
1409int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
1410 struct btrfs_fs_info *fs_info)
1411{
1412 struct btrfs_qgroup_extent_record *record;
1413 struct btrfs_delayed_ref_root *delayed_refs;
1414 struct rb_node *node;
1415 u64 qgroup_to_skip;
1416 int ret = 0;
1417
1418 delayed_refs = &trans->transaction->delayed_refs;
1419 qgroup_to_skip = delayed_refs->qgroup_to_skip;
1420
1421 /*
1422 * No need to do lock, since this function will only be called in
1423 * btrfs_commit_transaction().
1424 */
1425 node = rb_first(&delayed_refs->dirty_extent_root);
1426 while (node) {
1427 record = rb_entry(node, struct btrfs_qgroup_extent_record,
1428 node);
1429 if (WARN_ON(!record->old_roots))
1430 ret = btrfs_find_all_roots(NULL, fs_info,
1431 record->bytenr, 0, &record->old_roots);
1432 if (ret < 0)
1433 break;
1434 if (qgroup_to_skip)
1435 ulist_del(record->old_roots, qgroup_to_skip, 0);
1436 node = rb_next(node);
1437 }
1438 return ret;
1439}
1440
1441int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, 1409int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
1442 struct btrfs_delayed_ref_root *delayed_refs, 1410 struct btrfs_delayed_ref_root *delayed_refs,
1443 struct btrfs_qgroup_extent_record *record) 1411 struct btrfs_qgroup_extent_record *record)
@@ -1559,6 +1527,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
1559 if (ret) 1527 if (ret)
1560 return ret; 1528 return ret;
1561 } 1529 }
1530 cond_resched();
1562 return 0; 1531 return 0;
1563} 1532}
1564 1533
@@ -1918,6 +1887,35 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
1918 return 0; 1887 return 0;
1919} 1888}
1920 1889
1890/*
1891 * Check if the @roots potentially is a list of fs tree roots
1892 *
1893 * Return 0 for definitely not a fs/subvol tree roots ulist
1894 * Return 1 for possible fs/subvol tree roots in the list (considering an empty
1895 * one as well)
1896 */
1897static int maybe_fs_roots(struct ulist *roots)
1898{
1899 struct ulist_node *unode;
1900 struct ulist_iterator uiter;
1901
1902 /* Empty one, still possible for fs roots */
1903 if (!roots || roots->nnodes == 0)
1904 return 1;
1905
1906 ULIST_ITER_INIT(&uiter);
1907 unode = ulist_next(roots, &uiter);
1908 if (!unode)
1909 return 1;
1910
1911 /*
1912 * If it contains fs tree roots, then it must belong to fs/subvol
1913 * trees.
1914 * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
1915 */
1916 return is_fstree(unode->val);
1917}
1918
1921int 1919int
1922btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, 1920btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
1923 struct btrfs_fs_info *fs_info, 1921 struct btrfs_fs_info *fs_info,
@@ -1934,10 +1932,20 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
1934 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 1932 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
1935 return 0; 1933 return 0;
1936 1934
1937 if (new_roots) 1935 if (new_roots) {
1936 if (!maybe_fs_roots(new_roots))
1937 goto out_free;
1938 nr_new_roots = new_roots->nnodes; 1938 nr_new_roots = new_roots->nnodes;
1939 if (old_roots) 1939 }
1940 if (old_roots) {
1941 if (!maybe_fs_roots(old_roots))
1942 goto out_free;
1940 nr_old_roots = old_roots->nnodes; 1943 nr_old_roots = old_roots->nnodes;
1944 }
1945
1946 /* Quick exit, either not fs tree roots, or won't affect any qgroup */
1947 if (nr_old_roots == 0 && nr_new_roots == 0)
1948 goto out_free;
1941 1949
1942 BUG_ON(!fs_info->quota_root); 1950 BUG_ON(!fs_info->quota_root);
1943 1951
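maybe_fs_roots() only needs to look at one entry because, per its comment, a roots ulist from backref walking is homogeneous: fs/subvol tree roots never share extents with other trees, so the first node classifies the whole list. A hypothetical equivalent folding the two early returns together (is_fstree() is the existing ctree.h helper matching BTRFS_FS_TREE_OBJECTID and the allocated subvolume ids):

        static bool roots_affect_qgroups(struct ulist *roots)
        {
                struct ulist_iterator uiter;
                struct ulist_node *unode;

                if (!roots || roots->nnodes == 0)
                        return true;    /* empty: still possibly fs roots */
                ULIST_ITER_INIT(&uiter);
                unode = ulist_next(roots, &uiter);
                return unode ? is_fstree(unode->val) : true;
        }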
@@ -2017,6 +2025,19 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
2017 2025
2018 if (!ret) { 2026 if (!ret) {
2019 /* 2027 /*
2028 * Old roots should be searched when inserting qgroup
2029 * extent record
2030 */
2031 if (WARN_ON(!record->old_roots)) {
2032 /* Search commit root to find old_roots */
2033 ret = btrfs_find_all_roots(NULL, fs_info,
2034 record->bytenr, 0,
2035 &record->old_roots);
2036 if (ret < 0)
2037 goto cleanup;
2038 }
2039
2040 /*
2020 * Use SEQ_LAST as time_seq to do special search, which 2041 * Use SEQ_LAST as time_seq to do special search, which
2021 * doesn't lock tree or delayed_refs and search current 2042 * doesn't lock tree or delayed_refs and search current
2022 * root. It's safe inside commit_transaction(). 2043 * root. It's safe inside commit_transaction().
@@ -2025,8 +2046,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
2025 record->bytenr, SEQ_LAST, &new_roots); 2046 record->bytenr, SEQ_LAST, &new_roots);
2026 if (ret < 0) 2047 if (ret < 0)
2027 goto cleanup; 2048 goto cleanup;
2028 if (qgroup_to_skip) 2049 if (qgroup_to_skip) {
2029 ulist_del(new_roots, qgroup_to_skip, 0); 2050 ulist_del(new_roots, qgroup_to_skip, 0);
2051 ulist_del(record->old_roots, qgroup_to_skip,
2052 0);
2053 }
2030 ret = btrfs_qgroup_account_extent(trans, fs_info, 2054 ret = btrfs_qgroup_account_extent(trans, fs_info,
2031 record->bytenr, record->num_bytes, 2055 record->bytenr, record->num_bytes,
2032 record->old_roots, new_roots); 2056 record->old_roots, new_roots);
@@ -2338,6 +2362,11 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
2338 2362
2339 if (num_bytes == 0) 2363 if (num_bytes == 0)
2340 return 0; 2364 return 0;
2365
2366 if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
2367 capable(CAP_SYS_RESOURCE))
2368 enforce = false;
2369
2341retry: 2370retry:
2342 spin_lock(&fs_info->qgroup_lock); 2371 spin_lock(&fs_info->qgroup_lock);
2343 quota_root = fs_info->quota_root; 2372 quota_root = fs_info->quota_root;
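This is the quota override hook: when the BTRFS_FS_QUOTA_OVERRIDE flag is set (the quota override tunable from this pull) and the task has CAP_SYS_RESOURCE, qgroup_reserve() still accounts the reservation but skips the limit check. Sketched guard:

        bool enforce = true;

        if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
            capable(CAP_SYS_RESOURCE))
                enforce = false;   /* account usage, bypass the limits */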
@@ -2376,7 +2405,7 @@ retry:
2376 ret = btrfs_start_delalloc_inodes(root, 0); 2405 ret = btrfs_start_delalloc_inodes(root, 0);
2377 if (ret) 2406 if (ret)
2378 return ret; 2407 return ret;
2379 btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); 2408 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
2380 trans = btrfs_join_transaction(root); 2409 trans = btrfs_join_transaction(root);
2381 if (IS_ERR(trans)) 2410 if (IS_ERR(trans))
2382 return PTR_ERR(trans); 2411 return PTR_ERR(trans);
@@ -2806,55 +2835,130 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
2806 * Return <0 for error (including -EQUOT) 2835 * Return <0 for error (including -EQUOT)
2807 * 2836 *
2808 * NOTE: this function may sleep for memory allocation. 2837 * NOTE: this function may sleep for memory allocation.
2838 * if btrfs_qgroup_reserve_data() is called multiple times with
2839 * same @reserved, caller must ensure when error happens it's OK
2840 * to free *ALL* reserved space.
2809 */ 2841 */
2810int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) 2842int btrfs_qgroup_reserve_data(struct inode *inode,
2843 struct extent_changeset **reserved_ret, u64 start,
2844 u64 len)
2811{ 2845{
2812 struct btrfs_root *root = BTRFS_I(inode)->root; 2846 struct btrfs_root *root = BTRFS_I(inode)->root;
2813 struct extent_changeset changeset;
2814 struct ulist_node *unode; 2847 struct ulist_node *unode;
2815 struct ulist_iterator uiter; 2848 struct ulist_iterator uiter;
2849 struct extent_changeset *reserved;
2850 u64 orig_reserved;
2851 u64 to_reserve;
2816 int ret; 2852 int ret;
2817 2853
2818 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || 2854 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
2819 !is_fstree(root->objectid) || len == 0) 2855 !is_fstree(root->objectid) || len == 0)
2820 return 0; 2856 return 0;
2821 2857
2822 changeset.bytes_changed = 0; 2858 /* @reserved parameter is mandatory for qgroup */
2823 ulist_init(&changeset.range_changed); 2859 if (WARN_ON(!reserved_ret))
2860 return -EINVAL;
2861 if (!*reserved_ret) {
2862 *reserved_ret = extent_changeset_alloc();
2863 if (!*reserved_ret)
2864 return -ENOMEM;
2865 }
2866 reserved = *reserved_ret;
2867 /* Record already reserved space */
2868 orig_reserved = reserved->bytes_changed;
2824 ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, 2869 ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
2825 start + len -1, EXTENT_QGROUP_RESERVED, &changeset); 2870 start + len -1, EXTENT_QGROUP_RESERVED, reserved);
2871
2872 /* Newly reserved space */
2873 to_reserve = reserved->bytes_changed - orig_reserved;
2826 trace_btrfs_qgroup_reserve_data(inode, start, len, 2874 trace_btrfs_qgroup_reserve_data(inode, start, len,
2827 changeset.bytes_changed, 2875 to_reserve, QGROUP_RESERVE);
2828 QGROUP_RESERVE);
2829 if (ret < 0) 2876 if (ret < 0)
2830 goto cleanup; 2877 goto cleanup;
2831 ret = qgroup_reserve(root, changeset.bytes_changed, true); 2878 ret = qgroup_reserve(root, to_reserve, true);
2832 if (ret < 0) 2879 if (ret < 0)
2833 goto cleanup; 2880 goto cleanup;
2834 2881
2835 ulist_release(&changeset.range_changed);
2836 return ret; 2882 return ret;
2837 2883
2838cleanup: 2884cleanup:
2839 /* cleanup already reserved ranges */ 2885 /* cleanup *ALL* already reserved ranges */
2840 ULIST_ITER_INIT(&uiter); 2886 ULIST_ITER_INIT(&uiter);
2841 while ((unode = ulist_next(&changeset.range_changed, &uiter))) 2887 while ((unode = ulist_next(&reserved->range_changed, &uiter)))
2842 clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, 2888 clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
2843 unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, 2889 unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
2844 GFP_NOFS); 2890 GFP_NOFS);
2845 ulist_release(&changeset.range_changed); 2891 extent_changeset_release(reserved);
2892 return ret;
2893}
2894
2895/* Free ranges specified by @reserved, normally in error path */
2896static int qgroup_free_reserved_data(struct inode *inode,
2897 struct extent_changeset *reserved, u64 start, u64 len)
2898{
2899 struct btrfs_root *root = BTRFS_I(inode)->root;
2900 struct ulist_node *unode;
2901 struct ulist_iterator uiter;
2902 struct extent_changeset changeset;
2903 int freed = 0;
2904 int ret;
2905
2906 extent_changeset_init(&changeset);
2907 len = round_up(start + len, root->fs_info->sectorsize);
2908 start = round_down(start, root->fs_info->sectorsize);
2909
2910 ULIST_ITER_INIT(&uiter);
2911 while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
2912 u64 range_start = unode->val;
2913 /* unode->aux is the inclusive end */
2914 u64 range_len = unode->aux - range_start + 1;
2915 u64 free_start;
2916 u64 free_len;
2917
2918 extent_changeset_release(&changeset);
2919
2920 /* Only free range in range [start, start + len) */
2921 if (range_start >= start + len ||
2922 range_start + range_len <= start)
2923 continue;
2924 free_start = max(range_start, start);
2925 free_len = min(start + len, range_start + range_len) -
2926 free_start;
2927 /*
2928 * TODO: To also modify reserved->ranges_reserved to reflect
2929 * the modification.
2930 *
2931 * However as long as we free qgroup reserved according to
2932 * EXTENT_QGROUP_RESERVED, we won't double free.
2933 * So not need to rush.
2934 */
2935 ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree,
2936 free_start, free_start + free_len - 1,
2937 EXTENT_QGROUP_RESERVED, &changeset);
2938 if (ret < 0)
2939 goto out;
2940 freed += changeset.bytes_changed;
2941 }
2942 btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed);
2943 ret = freed;
2944out:
2945 extent_changeset_release(&changeset);
2846 return ret; 2946 return ret;
2847} 2947}
2848 2948
2849static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, 2949static int __btrfs_qgroup_release_data(struct inode *inode,
2850 int free) 2950 struct extent_changeset *reserved, u64 start, u64 len,
2951 int free)
2851{ 2952{
2852 struct extent_changeset changeset; 2953 struct extent_changeset changeset;
2853 int trace_op = QGROUP_RELEASE; 2954 int trace_op = QGROUP_RELEASE;
2854 int ret; 2955 int ret;
2855 2956
2856 changeset.bytes_changed = 0; 2957 /* In release case, we shouldn't have @reserved */
2857 ulist_init(&changeset.range_changed); 2958 WARN_ON(!free && reserved);
2959 if (free && reserved)
2960 return qgroup_free_reserved_data(inode, reserved, start, len);
2961 extent_changeset_init(&changeset);
2858 ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, 2962 ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
2859 start + len -1, EXTENT_QGROUP_RESERVED, &changeset); 2963 start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
2860 if (ret < 0) 2964 if (ret < 0)
@@ -2868,8 +2972,9 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
2868 btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, 2972 btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
2869 BTRFS_I(inode)->root->objectid, 2973 BTRFS_I(inode)->root->objectid,
2870 changeset.bytes_changed); 2974 changeset.bytes_changed);
2975 ret = changeset.bytes_changed;
2871out: 2976out:
2872 ulist_release(&changeset.range_changed); 2977 extent_changeset_release(&changeset);
2873 return ret; 2978 return ret;
2874} 2979}
2875 2980
@@ -2878,14 +2983,17 @@ out:
2878 * 2983 *
2879 * Should be called when a range of pages get invalidated before reaching disk. 2984 * Should be called when a range of pages get invalidated before reaching disk.
2880 * Or for error cleanup case. 2985 * Or for error cleanup case.
2986 * if @reserved is given, only reserved range in [@start, @start + @len) will
2987 * be freed.
2881 * 2988 *
2882 * For data written to disk, use btrfs_qgroup_release_data(). 2989 * For data written to disk, use btrfs_qgroup_release_data().
2883 * 2990 *
2884 * NOTE: This function may sleep for memory allocation. 2991 * NOTE: This function may sleep for memory allocation.
2885 */ 2992 */
2886int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) 2993int btrfs_qgroup_free_data(struct inode *inode,
2994 struct extent_changeset *reserved, u64 start, u64 len)
2887{ 2995{
2888 return __btrfs_qgroup_release_data(inode, start, len, 1); 2996 return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
2889} 2997}
2890 2998
2891/* 2999/*
@@ -2905,7 +3013,7 @@ int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
2905 */ 3013 */
2906int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) 3014int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
2907{ 3015{
2908 return __btrfs_qgroup_release_data(inode, start, len, 0); 3016 return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
2909} 3017}
2910 3018
2911int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 3019int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
@@ -2969,8 +3077,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
2969 struct ulist_iterator iter; 3077 struct ulist_iterator iter;
2970 int ret; 3078 int ret;
2971 3079
2972 changeset.bytes_changed = 0; 3080 extent_changeset_init(&changeset);
2973 ulist_init(&changeset.range_changed);
2974 ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, 3081 ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
2975 EXTENT_QGROUP_RESERVED, &changeset); 3082 EXTENT_QGROUP_RESERVED, &changeset);
2976 3083
@@ -2987,5 +3094,5 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
2987 changeset.bytes_changed); 3094 changeset.bytes_changed);
2988 3095
2989 } 3096 }
2990 ulist_release(&changeset.range_changed); 3097 extent_changeset_release(&changeset);
2991} 3098}
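The reworked free path, qgroup_free_reserved_data(), reduces to interval intersection: start/len are first aligned outward to sectorsize, then each recorded range (unode->val, with the inclusive end in unode->aux) is clamped against [start, start + len), and only the overlap is cleared and added to freed. The arithmetic in isolation, as an illustrative helper over end-exclusive ranges:

        /* overlap of [a, a + a_len) with [b, b + b_len); 0 if disjoint */
        static u64 range_overlap(u64 a, u64 a_len, u64 b, u64 b_len,
                                 u64 *ovl_start)
        {
                if (a >= b + b_len || b >= a + a_len)
                        return 0;
                *ovl_start = max(a, b);
                return min(a + a_len, b + b_len) - *ovl_start;
        }

Freeing per-range is what fixes the reserved-space underflow this pull names: a caller now hands back only the ranges its own changeset reserved, instead of clearing whole regions that may overlap another caller's reservation.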
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index fe04d3f295c6..d9984e87cddf 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -134,8 +134,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
134int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); 134int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
135void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); 135void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
136struct btrfs_delayed_extent_op; 136struct btrfs_delayed_extent_op;
137int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, 137
138 struct btrfs_fs_info *fs_info);
139/* 138/*
140 * Inform qgroup to trace one dirty extent, its info is recorded in @record. 139 * Inform qgroup to trace one dirty extent, its info is recorded in @record.
141 * So qgroup can account it at transaction committing time. 140 * So qgroup can account it at transaction committing time.
@@ -243,9 +242,11 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
243#endif 242#endif
244 243
245/* New io_tree based accurate qgroup reserve API */ 244/* New io_tree based accurate qgroup reserve API */
246int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len); 245int btrfs_qgroup_reserve_data(struct inode *inode,
246 struct extent_changeset **reserved, u64 start, u64 len);
247int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); 247int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
248int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); 248int btrfs_qgroup_free_data(struct inode *inode,
249 struct extent_changeset *reserved, u64 start, u64 len);
249 250
250int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 251int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
251 bool enforce); 252 bool enforce);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index f3d30d9ea8f9..6f845d219cd6 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -31,7 +31,7 @@
31#include <linux/hash.h> 31#include <linux/hash.h>
32#include <linux/list_sort.h> 32#include <linux/list_sort.h>
33#include <linux/raid/xor.h> 33#include <linux/raid/xor.h>
34#include <linux/vmalloc.h> 34#include <linux/mm.h>
35#include <asm/div64.h> 35#include <asm/div64.h>
36#include "ctree.h" 36#include "ctree.h"
37#include "extent_map.h" 37#include "extent_map.h"
@@ -218,12 +218,9 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
218 * of a failing mount. 218 * of a failing mount.
219 */ 219 */
220 table_size = sizeof(*table) + sizeof(*h) * num_entries; 220 table_size = sizeof(*table) + sizeof(*h) * num_entries;
221 table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 221 table = kvzalloc(table_size, GFP_KERNEL);
222 if (!table) { 222 if (!table)
223 table = vzalloc(table_size); 223 return -ENOMEM;
224 if (!table)
225 return -ENOMEM;
226 }
227 224
228 spin_lock_init(&table->cache_lock); 225 spin_lock_init(&table->cache_lock);
229 INIT_LIST_HEAD(&table->stripe_cache); 226 INIT_LIST_HEAD(&table->stripe_cache);
@@ -1101,10 +1098,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1101 } 1098 }
1102 1099
1103 /* put a new bio on the list */ 1100 /* put a new bio on the list */
1104 bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); 1101 bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
1105 if (!bio)
1106 return -ENOMEM;
1107
1108 bio->bi_iter.bi_size = 0; 1102 bio->bi_iter.bi_size = 0;
1109 bio->bi_bdev = stripe->dev->bdev; 1103 bio->bi_bdev = stripe->dev->bdev;
1110 bio->bi_iter.bi_sector = disk_start >> 9; 1104 bio->bi_iter.bi_sector = disk_start >> 9;
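The stripe hash table hunk replaces the open-coded kzalloc-then-vzalloc fallback with kvzalloc(), which expresses the same policy (kmalloc first, vmalloc fallback, zeroed either way) in a single call; the matching free side can use kvfree() for either backing store. Condensed:

        table = kvzalloc(table_size, GFP_KERNEL);
        if (!table)
                return -ENOMEM;
        /* ... */
        kvfree(table);  /* frees kmalloc- or vmalloc-backed memory alike */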
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index a17e775a4a89..ab852b8e3e37 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -66,7 +66,6 @@ struct reada_extctl {
66struct reada_extent { 66struct reada_extent {
67 u64 logical; 67 u64 logical;
68 struct btrfs_key top; 68 struct btrfs_key top;
69 int err;
70 struct list_head extctl; 69 struct list_head extctl;
71 int refcnt; 70 int refcnt;
72 spinlock_t lock; 71 spinlock_t lock;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d60df51959f7..65661d1aae4e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3093,11 +3093,12 @@ int prealloc_file_extent_cluster(struct inode *inode,
3093 u64 prealloc_start = cluster->start - offset; 3093 u64 prealloc_start = cluster->start - offset;
3094 u64 prealloc_end = cluster->end - offset; 3094 u64 prealloc_end = cluster->end - offset;
3095 u64 cur_offset; 3095 u64 cur_offset;
3096 struct extent_changeset *data_reserved = NULL;
3096 3097
3097 BUG_ON(cluster->start != cluster->boundary[0]); 3098 BUG_ON(cluster->start != cluster->boundary[0]);
3098 inode_lock(inode); 3099 inode_lock(inode);
3099 3100
3100 ret = btrfs_check_data_free_space(inode, prealloc_start, 3101 ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
3101 prealloc_end + 1 - prealloc_start); 3102 prealloc_end + 1 - prealloc_start);
3102 if (ret) 3103 if (ret)
3103 goto out; 3104 goto out;
@@ -3113,8 +3114,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
3113 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 3114 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3114 num_bytes = end + 1 - start; 3115 num_bytes = end + 1 - start;
3115 if (cur_offset < start) 3116 if (cur_offset < start)
3116 btrfs_free_reserved_data_space(inode, cur_offset, 3117 btrfs_free_reserved_data_space(inode, data_reserved,
3117 start - cur_offset); 3118 cur_offset, start - cur_offset);
3118 ret = btrfs_prealloc_file_range(inode, 0, start, 3119 ret = btrfs_prealloc_file_range(inode, 0, start,
3119 num_bytes, num_bytes, 3120 num_bytes, num_bytes,
3120 end + 1, &alloc_hint); 3121 end + 1, &alloc_hint);
@@ -3125,10 +3126,11 @@ int prealloc_file_extent_cluster(struct inode *inode,
3125 nr++; 3126 nr++;
3126 } 3127 }
3127 if (cur_offset < prealloc_end) 3128 if (cur_offset < prealloc_end)
3128 btrfs_free_reserved_data_space(inode, cur_offset, 3129 btrfs_free_reserved_data_space(inode, data_reserved,
3129 prealloc_end + 1 - cur_offset); 3130 cur_offset, prealloc_end + 1 - cur_offset);
3130out: 3131out:
3131 inode_unlock(inode); 3132 inode_unlock(inode);
3133 extent_changeset_free(data_reserved);
3132 return ret; 3134 return ret;
3133} 3135}
3134 3136
@@ -4269,8 +4271,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
4269 INIT_LIST_HEAD(&rc->reloc_roots); 4271 INIT_LIST_HEAD(&rc->reloc_roots);
4270 backref_cache_init(&rc->backref_cache); 4272 backref_cache_init(&rc->backref_cache);
4271 mapping_tree_init(&rc->reloc_root_tree); 4273 mapping_tree_init(&rc->reloc_root_tree);
4272 extent_io_tree_init(&rc->processed_blocks, 4274 extent_io_tree_init(&rc->processed_blocks, NULL);
4273 fs_info->btree_inode->i_mapping);
4274 return rc; 4275 return rc;
4275} 4276}
4276 4277
@@ -4372,7 +4373,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
4372 4373
4373 btrfs_wait_block_group_reservations(rc->block_group); 4374 btrfs_wait_block_group_reservations(rc->block_group);
4374 btrfs_wait_nocow_writers(rc->block_group); 4375 btrfs_wait_nocow_writers(rc->block_group);
4375 btrfs_wait_ordered_roots(fs_info, -1, 4376 btrfs_wait_ordered_roots(fs_info, U64_MAX,
4376 rc->block_group->key.objectid, 4377 rc->block_group->key.objectid,
4377 rc->block_group->key.offset); 4378 rc->block_group->key.offset);
4378 4379
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 7d6bc308bf43..460db0cb2d07 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -390,6 +390,13 @@ again:
390 WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); 390 WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
391 WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); 391 WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
392 ptr = (unsigned long)(ref + 1); 392 ptr = (unsigned long)(ref + 1);
393 ret = btrfs_is_name_len_valid(leaf, path->slots[0], ptr,
394 name_len);
395 if (!ret) {
396 err = -EIO;
397 goto out;
398 }
399
393 WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); 400 WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
394 *sequence = btrfs_root_ref_sequence(leaf, ref); 401 *sequence = btrfs_root_ref_sequence(leaf, ref);
395 402
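btrfs_del_root_ref() picks up the same name_len hardening as the dir-item patches: the length read from the root ref item is validated against the item bounds before memcmp_extent_buffer() walks the name, so a crafted image cannot trigger an out-of-bounds read. btrfs_is_name_len_valid() returns nonzero only when the name fits, hence the "if (!ret)" error path above. The pattern:

        ptr = (unsigned long)(ref + 1);
        if (!btrfs_is_name_len_valid(leaf, path->slots[0], ptr, name_len)) {
                err = -EIO;     /* name would run past the item */
                goto out;
        }
        /* only now is (ptr, name_len) safe to compare */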
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ba5595d19de1..6f1e4c984b94 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/ratelimit.h> 20#include <linux/ratelimit.h>
21#include <linux/sched/mm.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "volumes.h" 23#include "volumes.h"
23#include "disk-io.h" 24#include "disk-io.h"
@@ -161,14 +162,6 @@ struct scrub_parity {
161 unsigned long bitmap[0]; 162 unsigned long bitmap[0];
162}; 163};
163 164
164struct scrub_wr_ctx {
165 struct scrub_bio *wr_curr_bio;
166 struct btrfs_device *tgtdev;
167 int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
168 atomic_t flush_all_writes;
169 struct mutex wr_lock;
170};
171
172struct scrub_ctx { 165struct scrub_ctx {
173 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; 166 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
174 struct btrfs_fs_info *fs_info; 167 struct btrfs_fs_info *fs_info;
@@ -183,11 +176,14 @@ struct scrub_ctx {
183 atomic_t cancel_req; 176 atomic_t cancel_req;
184 int readonly; 177 int readonly;
185 int pages_per_rd_bio; 178 int pages_per_rd_bio;
186 u32 sectorsize;
187 u32 nodesize;
188 179
189 int is_dev_replace; 180 int is_dev_replace;
190 struct scrub_wr_ctx wr_ctx; 181
182 struct scrub_bio *wr_curr_bio;
183 struct mutex wr_lock;
184 int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
185 atomic_t flush_all_writes;
186 struct btrfs_device *wr_tgtdev;
191 187
192 /* 188 /*
193 * statistics 189 * statistics
@@ -289,10 +285,6 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
289 u64 *extent_physical, 285 u64 *extent_physical,
290 struct btrfs_device **extent_dev, 286 struct btrfs_device **extent_dev,
291 int *extent_mirror_num); 287 int *extent_mirror_num);
292static int scrub_setup_wr_ctx(struct scrub_wr_ctx *wr_ctx,
293 struct btrfs_device *dev,
294 int is_dev_replace);
295static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
296static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, 288static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
297 struct scrub_page *spage); 289 struct scrub_page *spage);
298static void scrub_wr_submit(struct scrub_ctx *sctx); 290static void scrub_wr_submit(struct scrub_ctx *sctx);
@@ -643,8 +635,6 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
643 if (!sctx) 635 if (!sctx)
644 return; 636 return;
645 637
646 scrub_free_wr_ctx(&sctx->wr_ctx);
647
648 /* this can happen when scrub is cancelled */ 638 /* this can happen when scrub is cancelled */
649 if (sctx->curr != -1) { 639 if (sctx->curr != -1) {
650 struct scrub_bio *sbio = sctx->bios[sctx->curr]; 640 struct scrub_bio *sbio = sctx->bios[sctx->curr];
@@ -664,6 +654,7 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
664 kfree(sbio); 654 kfree(sbio);
665 } 655 }
666 656
657 kfree(sctx->wr_curr_bio);
667 scrub_free_csums(sctx); 658 scrub_free_csums(sctx);
668 kfree(sctx); 659 kfree(sctx);
669} 660}
@@ -680,7 +671,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
680 struct scrub_ctx *sctx; 671 struct scrub_ctx *sctx;
681 int i; 672 int i;
682 struct btrfs_fs_info *fs_info = dev->fs_info; 673 struct btrfs_fs_info *fs_info = dev->fs_info;
683 int ret;
684 674
685 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); 675 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
686 if (!sctx) 676 if (!sctx)
@@ -710,8 +700,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
710 sctx->bios[i]->next_free = -1; 700 sctx->bios[i]->next_free = -1;
711 } 701 }
712 sctx->first_free = 0; 702 sctx->first_free = 0;
713 sctx->nodesize = fs_info->nodesize;
714 sctx->sectorsize = fs_info->sectorsize;
715 atomic_set(&sctx->bios_in_flight, 0); 703 atomic_set(&sctx->bios_in_flight, 0);
716 atomic_set(&sctx->workers_pending, 0); 704 atomic_set(&sctx->workers_pending, 0);
717 atomic_set(&sctx->cancel_req, 0); 705 atomic_set(&sctx->cancel_req, 0);
@@ -722,12 +710,16 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
722 spin_lock_init(&sctx->stat_lock); 710 spin_lock_init(&sctx->stat_lock);
723 init_waitqueue_head(&sctx->list_wait); 711 init_waitqueue_head(&sctx->list_wait);
724 712
725 ret = scrub_setup_wr_ctx(&sctx->wr_ctx, 713 WARN_ON(sctx->wr_curr_bio != NULL);
726 fs_info->dev_replace.tgtdev, is_dev_replace); 714 mutex_init(&sctx->wr_lock);
727 if (ret) { 715 sctx->wr_curr_bio = NULL;
728 scrub_free_ctx(sctx); 716 if (is_dev_replace) {
729 return ERR_PTR(ret); 717 WARN_ON(!fs_info->dev_replace.tgtdev);
718 sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
719 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
720 atomic_set(&sctx->flush_all_writes, 0);
730 } 721 }
722
731 return sctx; 723 return sctx;
732 724
733nomem: 725nomem:
@@ -742,6 +734,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
742 u32 nlink; 734 u32 nlink;
743 int ret; 735 int ret;
744 int i; 736 int i;
737 unsigned nofs_flag;
745 struct extent_buffer *eb; 738 struct extent_buffer *eb;
746 struct btrfs_inode_item *inode_item; 739 struct btrfs_inode_item *inode_item;
747 struct scrub_warning *swarn = warn_ctx; 740 struct scrub_warning *swarn = warn_ctx;
@@ -780,7 +773,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
780 nlink = btrfs_inode_nlink(eb, inode_item); 773 nlink = btrfs_inode_nlink(eb, inode_item);
781 btrfs_release_path(swarn->path); 774 btrfs_release_path(swarn->path);
782 775
776 /*
777	 * init_ipath might indirectly call vmalloc, or use GFP_KERNEL. Scrub
778 * uses GFP_NOFS in this context, so we keep it consistent but it does
779 * not seem to be strictly necessary.
780 */
781 nofs_flag = memalloc_nofs_save();
783 ipath = init_ipath(4096, local_root, swarn->path); 782 ipath = init_ipath(4096, local_root, swarn->path);
783 memalloc_nofs_restore(nofs_flag);
784 if (IS_ERR(ipath)) { 784 if (IS_ERR(ipath)) {
785 ret = PTR_ERR(ipath); 785 ret = PTR_ERR(ipath);
786 ipath = NULL; 786 ipath = NULL;
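memalloc_nofs_save()/memalloc_nofs_restore() (from linux/sched/mm.h, included at the top of this scrub diff) mark a scope rather than a call site: every allocation reached from inside the scope, at any call depth, behaves as GFP_NOFS even where the callee itself passes GFP_KERNEL. That is what makes the pair usable around init_ipath(), whose internal allocations scrub could not otherwise influence. The scope pattern, with a hypothetical callee:

        unsigned int nofs_flag;

        nofs_flag = memalloc_nofs_save();
        /* anything allocated in here is implicitly GFP_NOFS */
        ptr = some_deep_helper();       /* hypothetical callee */
        memalloc_nofs_restore(nofs_flag);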
@@ -954,7 +954,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
954 ret = -EIO; 954 ret = -EIO;
955 goto out; 955 goto out;
956 } 956 }
957 ret = repair_io_failure(BTRFS_I(inode), offset, PAGE_SIZE, 957 ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE,
958 fixup->logical, page, 958 fixup->logical, page,
959 offset - page_offset(page), 959 offset - page_offset(page),
960 fixup->mirror_num); 960 fixup->mirror_num);
@@ -1737,12 +1737,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1737 } 1737 }
1738 1738
1739 WARN_ON(!page->page); 1739 WARN_ON(!page->page);
1740 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 1740 bio = btrfs_io_bio_alloc(1);
1741 if (!bio) {
1742 page->io_error = 1;
1743 sblock->no_io_error_seen = 0;
1744 continue;
1745 }
1746 bio->bi_bdev = page->dev->bdev; 1741 bio->bi_bdev = page->dev->bdev;
1747 1742
1748 bio_add_page(bio, page->page, PAGE_SIZE, 0); 1743 bio_add_page(bio, page->page, PAGE_SIZE, 0);
@@ -1830,9 +1825,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1830 return -EIO; 1825 return -EIO;
1831 } 1826 }
1832 1827
1833 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 1828 bio = btrfs_io_bio_alloc(1);
1834 if (!bio)
1835 return -EIO;
1836 bio->bi_bdev = page_bad->dev->bdev; 1829 bio->bi_bdev = page_bad->dev->bdev;
1837 bio->bi_iter.bi_sector = page_bad->physical >> 9; 1830 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1838 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 1831 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
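The repeated scrub pattern above: btrfs_io_bio_alloc() loses both its gfp_t argument and its failure handling. As converted elsewhere in this series, the helper allocates from the btrfs bioset with a blocking mask internally, and a mempool-backed bio_alloc_bioset() with such a mask waits for memory rather than returning NULL, so every "if (!bio)" branch after it was dead code. Before/after:

        /* before: two steps and a bailout at every call site */
        bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
        if (!bio)
                return -EIO;

        /* after: may sleep, never returns NULL */
        bio = btrfs_io_bio_alloc(1);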
@@ -1898,37 +1891,31 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1898static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, 1891static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1899 struct scrub_page *spage) 1892 struct scrub_page *spage)
1900{ 1893{
1901 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1902 struct scrub_bio *sbio; 1894 struct scrub_bio *sbio;
1903 int ret; 1895 int ret;
1904 1896
1905 mutex_lock(&wr_ctx->wr_lock); 1897 mutex_lock(&sctx->wr_lock);
1906again: 1898again:
1907 if (!wr_ctx->wr_curr_bio) { 1899 if (!sctx->wr_curr_bio) {
1908 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio), 1900 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1909 GFP_KERNEL); 1901 GFP_KERNEL);
1910 if (!wr_ctx->wr_curr_bio) { 1902 if (!sctx->wr_curr_bio) {
1911 mutex_unlock(&wr_ctx->wr_lock); 1903 mutex_unlock(&sctx->wr_lock);
1912 return -ENOMEM; 1904 return -ENOMEM;
1913 } 1905 }
1914 wr_ctx->wr_curr_bio->sctx = sctx; 1906 sctx->wr_curr_bio->sctx = sctx;
1915 wr_ctx->wr_curr_bio->page_count = 0; 1907 sctx->wr_curr_bio->page_count = 0;
1916 } 1908 }
1917 sbio = wr_ctx->wr_curr_bio; 1909 sbio = sctx->wr_curr_bio;
1918 if (sbio->page_count == 0) { 1910 if (sbio->page_count == 0) {
1919 struct bio *bio; 1911 struct bio *bio;
1920 1912
1921 sbio->physical = spage->physical_for_dev_replace; 1913 sbio->physical = spage->physical_for_dev_replace;
1922 sbio->logical = spage->logical; 1914 sbio->logical = spage->logical;
1923 sbio->dev = wr_ctx->tgtdev; 1915 sbio->dev = sctx->wr_tgtdev;
1924 bio = sbio->bio; 1916 bio = sbio->bio;
1925 if (!bio) { 1917 if (!bio) {
1926 bio = btrfs_io_bio_alloc(GFP_KERNEL, 1918 bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
1927 wr_ctx->pages_per_wr_bio);
1928 if (!bio) {
1929 mutex_unlock(&wr_ctx->wr_lock);
1930 return -ENOMEM;
1931 }
1932 sbio->bio = bio; 1919 sbio->bio = bio;
1933 } 1920 }
1934 1921
@@ -1951,7 +1938,7 @@ again:
1951 if (sbio->page_count < 1) { 1938 if (sbio->page_count < 1) {
1952 bio_put(sbio->bio); 1939 bio_put(sbio->bio);
1953 sbio->bio = NULL; 1940 sbio->bio = NULL;
1954 mutex_unlock(&wr_ctx->wr_lock); 1941 mutex_unlock(&sctx->wr_lock);
1955 return -EIO; 1942 return -EIO;
1956 } 1943 }
1957 scrub_wr_submit(sctx); 1944 scrub_wr_submit(sctx);
@@ -1961,23 +1948,22 @@ again:
1961 sbio->pagev[sbio->page_count] = spage; 1948 sbio->pagev[sbio->page_count] = spage;
1962 scrub_page_get(spage); 1949 scrub_page_get(spage);
1963 sbio->page_count++; 1950 sbio->page_count++;
1964 if (sbio->page_count == wr_ctx->pages_per_wr_bio) 1951 if (sbio->page_count == sctx->pages_per_wr_bio)
1965 scrub_wr_submit(sctx); 1952 scrub_wr_submit(sctx);
1966 mutex_unlock(&wr_ctx->wr_lock); 1953 mutex_unlock(&sctx->wr_lock);
1967 1954
1968 return 0; 1955 return 0;
1969} 1956}
1970 1957
1971static void scrub_wr_submit(struct scrub_ctx *sctx) 1958static void scrub_wr_submit(struct scrub_ctx *sctx)
1972{ 1959{
1973 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1974 struct scrub_bio *sbio; 1960 struct scrub_bio *sbio;
1975 1961
1976 if (!wr_ctx->wr_curr_bio) 1962 if (!sctx->wr_curr_bio)
1977 return; 1963 return;
1978 1964
1979 sbio = wr_ctx->wr_curr_bio; 1965 sbio = sctx->wr_curr_bio;
1980 wr_ctx->wr_curr_bio = NULL; 1966 sctx->wr_curr_bio = NULL;
1981 WARN_ON(!sbio->bio->bi_bdev); 1967 WARN_ON(!sbio->bio->bi_bdev);
1982 scrub_pending_bio_inc(sctx); 1968 scrub_pending_bio_inc(sctx);
1983 /* process all writes in a single worker thread. Then the block layer 1969 /* process all writes in a single worker thread. Then the block layer
@@ -2081,7 +2067,7 @@ static int scrub_checksum_data(struct scrub_block *sblock)
2081 page = sblock->pagev[0]->page; 2067 page = sblock->pagev[0]->page;
2082 buffer = kmap_atomic(page); 2068 buffer = kmap_atomic(page);
2083 2069
2084 len = sctx->sectorsize; 2070 len = sctx->fs_info->sectorsize;
2085 index = 0; 2071 index = 0;
2086 for (;;) { 2072 for (;;) {
2087 u64 l = min_t(u64, len, PAGE_SIZE); 2073 u64 l = min_t(u64, len, PAGE_SIZE);
@@ -2146,7 +2132,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
2146 BTRFS_UUID_SIZE)) 2132 BTRFS_UUID_SIZE))
2147 sblock->header_error = 1; 2133 sblock->header_error = 1;
2148 2134
2149 len = sctx->nodesize - BTRFS_CSUM_SIZE; 2135 len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
2150 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 2136 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2151 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 2137 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2152 index = 0; 2138 index = 0;
@@ -2329,10 +2315,7 @@ again:
2329 sbio->dev = spage->dev; 2315 sbio->dev = spage->dev;
2330 bio = sbio->bio; 2316 bio = sbio->bio;
2331 if (!bio) { 2317 if (!bio) {
2332 bio = btrfs_io_bio_alloc(GFP_KERNEL, 2318 bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
2333 sctx->pages_per_rd_bio);
2334 if (!bio)
2335 return -ENOMEM;
2336 sbio->bio = bio; 2319 sbio->bio = bio;
2337 } 2320 }
2338 2321
@@ -2420,10 +2403,10 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
2420 scrub_block_put(sblock); 2403 scrub_block_put(sblock);
2421 2404
2422 if (sctx->is_dev_replace && 2405 if (sctx->is_dev_replace &&
2423 atomic_read(&sctx->wr_ctx.flush_all_writes)) { 2406 atomic_read(&sctx->flush_all_writes)) {
2424 mutex_lock(&sctx->wr_ctx.wr_lock); 2407 mutex_lock(&sctx->wr_lock);
2425 scrub_wr_submit(sctx); 2408 scrub_wr_submit(sctx);
2426 mutex_unlock(&sctx->wr_ctx.wr_lock); 2409 mutex_unlock(&sctx->wr_lock);
2427 } 2410 }
2428 2411
2429 scrub_pending_bio_dec(sctx); 2412 scrub_pending_bio_dec(sctx);
@@ -2458,10 +2441,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2458 goto bbio_out; 2441 goto bbio_out;
2459 } 2442 }
2460 2443
2461 bio = btrfs_io_bio_alloc(GFP_NOFS, 0); 2444 bio = btrfs_io_bio_alloc(0);
2462 if (!bio)
2463 goto bbio_out;
2464
2465 bio->bi_iter.bi_sector = logical >> 9; 2445 bio->bi_iter.bi_sector = logical >> 9;
2466 bio->bi_private = sblock; 2446 bio->bi_private = sblock;
2467 bio->bi_end_io = scrub_missing_raid56_end_io; 2447 bio->bi_end_io = scrub_missing_raid56_end_io;
@@ -2628,10 +2608,10 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
2628 spin_unlock(&sctx->list_lock); 2608 spin_unlock(&sctx->list_lock);
2629 2609
2630 if (sctx->is_dev_replace && 2610 if (sctx->is_dev_replace &&
2631 atomic_read(&sctx->wr_ctx.flush_all_writes)) { 2611 atomic_read(&sctx->flush_all_writes)) {
2632 mutex_lock(&sctx->wr_ctx.wr_lock); 2612 mutex_lock(&sctx->wr_lock);
2633 scrub_wr_submit(sctx); 2613 scrub_wr_submit(sctx);
2634 mutex_unlock(&sctx->wr_ctx.wr_lock); 2614 mutex_unlock(&sctx->wr_lock);
2635 } 2615 }
2636 2616
2637 scrub_pending_bio_dec(sctx); 2617 scrub_pending_bio_dec(sctx);
@@ -2726,8 +2706,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2726 if (!sum) 2706 if (!sum)
2727 return 0; 2707 return 0;
2728 2708
2729 index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize; 2709 index = ((u32)(logical - sum->bytenr)) / sctx->fs_info->sectorsize;
2730 num_sectors = sum->len / sctx->sectorsize; 2710 num_sectors = sum->len / sctx->fs_info->sectorsize;
2731 memcpy(csum, sum->sums + index, sctx->csum_size); 2711 memcpy(csum, sum->sums + index, sctx->csum_size);
2732 if (index == num_sectors - 1) { 2712 if (index == num_sectors - 1) {
2733 list_del(&sum->list); 2713 list_del(&sum->list);
@@ -2746,19 +2726,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2746 u32 blocksize; 2726 u32 blocksize;
2747 2727
2748 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2728 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2749 blocksize = sctx->sectorsize; 2729 blocksize = sctx->fs_info->sectorsize;
2750 spin_lock(&sctx->stat_lock); 2730 spin_lock(&sctx->stat_lock);
2751 sctx->stat.data_extents_scrubbed++; 2731 sctx->stat.data_extents_scrubbed++;
2752 sctx->stat.data_bytes_scrubbed += len; 2732 sctx->stat.data_bytes_scrubbed += len;
2753 spin_unlock(&sctx->stat_lock); 2733 spin_unlock(&sctx->stat_lock);
2754 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2734 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2755 blocksize = sctx->nodesize; 2735 blocksize = sctx->fs_info->nodesize;
2756 spin_lock(&sctx->stat_lock); 2736 spin_lock(&sctx->stat_lock);
2757 sctx->stat.tree_extents_scrubbed++; 2737 sctx->stat.tree_extents_scrubbed++;
2758 sctx->stat.tree_bytes_scrubbed += len; 2738 sctx->stat.tree_bytes_scrubbed += len;
2759 spin_unlock(&sctx->stat_lock); 2739 spin_unlock(&sctx->stat_lock);
2760 } else { 2740 } else {
2761 blocksize = sctx->sectorsize; 2741 blocksize = sctx->fs_info->sectorsize;
2762 WARN_ON(1); 2742 WARN_ON(1);
2763 } 2743 }
2764 2744
@@ -2892,11 +2872,11 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
2892 } 2872 }
2893 2873
2894 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2874 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2895 blocksize = sctx->sectorsize; 2875 blocksize = sctx->fs_info->sectorsize;
2896 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2876 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2897 blocksize = sctx->nodesize; 2877 blocksize = sctx->fs_info->nodesize;
2898 } else { 2878 } else {
2899 blocksize = sctx->sectorsize; 2879 blocksize = sctx->fs_info->sectorsize;
2900 WARN_ON(1); 2880 WARN_ON(1);
2901 } 2881 }
2902 2882
@@ -3037,10 +3017,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
3037 if (ret || !bbio || !bbio->raid_map) 3017 if (ret || !bbio || !bbio->raid_map)
3038 goto bbio_out; 3018 goto bbio_out;
3039 3019
3040 bio = btrfs_io_bio_alloc(GFP_NOFS, 0); 3020 bio = btrfs_io_bio_alloc(0);
3041 if (!bio)
3042 goto bbio_out;
3043
3044 bio->bi_iter.bi_sector = sparity->logic_start >> 9; 3021 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
3045 bio->bi_private = sparity; 3022 bio->bi_private = sparity;
3046 bio->bi_end_io = scrub_parity_bio_endio; 3023 bio->bi_end_io = scrub_parity_bio_endio;
@@ -3305,9 +3282,9 @@ out:
3305 logic_end - logic_start); 3282 logic_end - logic_start);
3306 scrub_parity_put(sparity); 3283 scrub_parity_put(sparity);
3307 scrub_submit(sctx); 3284 scrub_submit(sctx);
3308 mutex_lock(&sctx->wr_ctx.wr_lock); 3285 mutex_lock(&sctx->wr_lock);
3309 scrub_wr_submit(sctx); 3286 scrub_wr_submit(sctx);
3310 mutex_unlock(&sctx->wr_ctx.wr_lock); 3287 mutex_unlock(&sctx->wr_lock);
3311 3288
3312 btrfs_release_path(path); 3289 btrfs_release_path(path);
3313 return ret < 0 ? ret : 0; 3290 return ret < 0 ? ret : 0;
@@ -3463,14 +3440,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3463 */ 3440 */
3464 if (atomic_read(&fs_info->scrub_pause_req)) { 3441 if (atomic_read(&fs_info->scrub_pause_req)) {
3465 /* push queued extents */ 3442 /* push queued extents */
3466 atomic_set(&sctx->wr_ctx.flush_all_writes, 1); 3443 atomic_set(&sctx->flush_all_writes, 1);
3467 scrub_submit(sctx); 3444 scrub_submit(sctx);
3468 mutex_lock(&sctx->wr_ctx.wr_lock); 3445 mutex_lock(&sctx->wr_lock);
3469 scrub_wr_submit(sctx); 3446 scrub_wr_submit(sctx);
3470 mutex_unlock(&sctx->wr_ctx.wr_lock); 3447 mutex_unlock(&sctx->wr_lock);
3471 wait_event(sctx->list_wait, 3448 wait_event(sctx->list_wait,
3472 atomic_read(&sctx->bios_in_flight) == 0); 3449 atomic_read(&sctx->bios_in_flight) == 0);
3473 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 3450 atomic_set(&sctx->flush_all_writes, 0);
3474 scrub_blocked_if_needed(fs_info); 3451 scrub_blocked_if_needed(fs_info);
3475 } 3452 }
3476 3453
@@ -3677,9 +3654,9 @@ skip:
3677out: 3654out:
3678 /* push queued extents */ 3655 /* push queued extents */
3679 scrub_submit(sctx); 3656 scrub_submit(sctx);
3680 mutex_lock(&sctx->wr_ctx.wr_lock); 3657 mutex_lock(&sctx->wr_lock);
3681 scrub_wr_submit(sctx); 3658 scrub_wr_submit(sctx);
3682 mutex_unlock(&sctx->wr_ctx.wr_lock); 3659 mutex_unlock(&sctx->wr_lock);
3683 3660
3684 blk_finish_plug(&plug); 3661 blk_finish_plug(&plug);
3685 btrfs_free_path(path); 3662 btrfs_free_path(path);
@@ -3859,7 +3836,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3859 */ 3836 */
3860 btrfs_wait_block_group_reservations(cache); 3837 btrfs_wait_block_group_reservations(cache);
3861 btrfs_wait_nocow_writers(cache); 3838 btrfs_wait_nocow_writers(cache);
3862 ret = btrfs_wait_ordered_roots(fs_info, -1, 3839 ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
3863 cache->key.objectid, 3840 cache->key.objectid,
3864 cache->key.offset); 3841 cache->key.offset);
3865 if (ret > 0) { 3842 if (ret > 0) {
@@ -3916,11 +3893,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3916 * write requests are really completed when bios_in_flight 3893 * write requests are really completed when bios_in_flight
3917 * changes to 0. 3894 * changes to 0.
3918 */ 3895 */
3919 atomic_set(&sctx->wr_ctx.flush_all_writes, 1); 3896 atomic_set(&sctx->flush_all_writes, 1);
3920 scrub_submit(sctx); 3897 scrub_submit(sctx);
3921 mutex_lock(&sctx->wr_ctx.wr_lock); 3898 mutex_lock(&sctx->wr_lock);
3922 scrub_wr_submit(sctx); 3899 scrub_wr_submit(sctx);
3923 mutex_unlock(&sctx->wr_ctx.wr_lock); 3900 mutex_unlock(&sctx->wr_lock);
3924 3901
3925 wait_event(sctx->list_wait, 3902 wait_event(sctx->list_wait,
3926 atomic_read(&sctx->bios_in_flight) == 0); 3903 atomic_read(&sctx->bios_in_flight) == 0);
@@ -3934,7 +3911,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3934 */ 3911 */
3935 wait_event(sctx->list_wait, 3912 wait_event(sctx->list_wait,
3936 atomic_read(&sctx->workers_pending) == 0); 3913 atomic_read(&sctx->workers_pending) == 0);
3937 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 3914 atomic_set(&sctx->flush_all_writes, 0);
3938 3915
3939 scrub_pause_off(fs_info); 3916 scrub_pause_off(fs_info);
3940 3917
@@ -4337,32 +4314,6 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4337 btrfs_put_bbio(bbio); 4314 btrfs_put_bbio(bbio);
4338} 4315}
4339 4316
4340static int scrub_setup_wr_ctx(struct scrub_wr_ctx *wr_ctx,
4341 struct btrfs_device *dev,
4342 int is_dev_replace)
4343{
4344 WARN_ON(wr_ctx->wr_curr_bio != NULL);
4345
4346 mutex_init(&wr_ctx->wr_lock);
4347 wr_ctx->wr_curr_bio = NULL;
4348 if (!is_dev_replace)
4349 return 0;
4350
4351 WARN_ON(!dev->bdev);
4352 wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
4353 wr_ctx->tgtdev = dev;
4354 atomic_set(&wr_ctx->flush_all_writes, 0);
4355 return 0;
4356}
4357
4358static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
4359{
4360 mutex_lock(&wr_ctx->wr_lock);
4361 kfree(wr_ctx->wr_curr_bio);
4362 wr_ctx->wr_curr_bio = NULL;
4363 mutex_unlock(&wr_ctx->wr_lock);
4364}
4365
4366static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 4317static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
4367 int mirror_num, u64 physical_for_dev_replace) 4318 int mirror_num, u64 physical_for_dev_replace)
4368{ 4319{
@@ -4665,7 +4616,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
4665 struct btrfs_device *dev; 4616 struct btrfs_device *dev;
4666 int ret; 4617 int ret;
4667 4618
4668 dev = sctx->wr_ctx.tgtdev; 4619 dev = sctx->wr_tgtdev;
4669 if (!dev) 4620 if (!dev)
4670 return -EIO; 4621 return -EIO;
4671 if (!dev->bdev) { 4622 if (!dev->bdev) {
@@ -4673,13 +4624,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
4673 "scrub write_page_nocow(bdev == NULL) is unexpected"); 4624 "scrub write_page_nocow(bdev == NULL) is unexpected");
4674 return -EIO; 4625 return -EIO;
4675 } 4626 }
4676 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 4627 bio = btrfs_io_bio_alloc(1);
4677 if (!bio) {
4678 spin_lock(&sctx->stat_lock);
4679 sctx->stat.malloc_errors++;
4680 spin_unlock(&sctx->stat_lock);
4681 return -ENOMEM;
4682 }
4683 bio->bi_iter.bi_size = 0; 4628 bio->bi_iter.bi_size = 0;
4684 bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; 4629 bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
4685 bio->bi_bdev = dev->bdev; 4630 bio->bi_bdev = dev->bdev;
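The two dropped error paths above (in scrub_parity_check_and_repair and write_page_nocow) depend on the new allocation contract for I/O bios introduced by this series: the bios now come from a mempool-backed bioset, so the allocation may block but never returns NULL. A minimal sketch of the assumed helper shape; the real implementation lives in fs/btrfs/extent_io.c, and btrfs_bioset and btrfs_io_bio_init are the names this series uses, so treat the exact form as illustrative:

struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
{
	struct bio *bio;

	/*
	 * Mempool-backed bioset: with a gfp mask that may block,
	 * bio_alloc_bioset() waits for a free entry instead of
	 * returning NULL, so callers need no -ENOMEM handling.
	 */
	bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset);
	btrfs_io_bio_init(btrfs_io_bio(bio));
	return bio;
}

This is also why the GFP argument could be sunk out of the call sites: the gfp mask becomes an internal detail of the helper.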
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fc496a6f842a..e937c10b8287 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1069,6 +1069,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1069 } 1069 }
1070 } 1070 }
1071 1071
1072 ret = btrfs_is_name_len_valid(eb, path->slots[0],
1073 (unsigned long)(di + 1), name_len + data_len);
1074 if (!ret) {
1075 ret = -EIO;
1076 goto out;
1077 }
1072 if (name_len + data_len > buf_len) { 1078 if (name_len + data_len > buf_len) {
1073 buf_len = name_len + data_len; 1079 buf_len = name_len + data_len;
1074 if (is_vmalloc_addr(buf)) { 1080 if (is_vmalloc_addr(buf)) {
@@ -1083,7 +1089,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1083 buf = tmp; 1089 buf = tmp;
1084 } 1090 }
1085 if (!buf) { 1091 if (!buf) {
1086 buf = vmalloc(buf_len); 1092 buf = kvmalloc(buf_len, GFP_KERNEL);
1087 if (!buf) { 1093 if (!buf) {
1088 ret = -ENOMEM; 1094 ret = -ENOMEM;
1089 goto out; 1095 goto out;
@@ -2769,15 +2775,20 @@ out:
2769 2775
2770struct recorded_ref { 2776struct recorded_ref {
2771 struct list_head list; 2777 struct list_head list;
2772 char *dir_path;
2773 char *name; 2778 char *name;
2774 struct fs_path *full_path; 2779 struct fs_path *full_path;
2775 u64 dir; 2780 u64 dir;
2776 u64 dir_gen; 2781 u64 dir_gen;
2777 int dir_path_len;
2778 int name_len; 2782 int name_len;
2779}; 2783};
2780 2784
2785static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
2786{
2787 ref->full_path = path;
2788 ref->name = (char *)kbasename(ref->full_path->start);
2789 ref->name_len = ref->full_path->end - ref->name;
2790}
2791
2781/* 2792/*
2782 * We need to process new refs before deleted refs, but compare_tree gives us 2793 * We need to process new refs before deleted refs, but compare_tree gives us
2783 * everything mixed. So we first record all refs and later process them. 2794 * everything mixed. So we first record all refs and later process them.
@@ -2794,17 +2805,7 @@ static int __record_ref(struct list_head *head, u64 dir,
2794 2805
2795 ref->dir = dir; 2806 ref->dir = dir;
2796 ref->dir_gen = dir_gen; 2807 ref->dir_gen = dir_gen;
2797 ref->full_path = path; 2808 set_ref_path(ref, path);
2798
2799 ref->name = (char *)kbasename(ref->full_path->start);
2800 ref->name_len = ref->full_path->end - ref->name;
2801 ref->dir_path = ref->full_path->start;
2802 if (ref->name == ref->full_path->start)
2803 ref->dir_path_len = 0;
2804 else
2805 ref->dir_path_len = ref->full_path->end -
2806 ref->full_path->start - 1 - ref->name_len;
2807
2808 list_add_tail(&ref->list, head); 2809 list_add_tail(&ref->list, head);
2809 return 0; 2810 return 0;
2810} 2811}
@@ -3546,9 +3547,17 @@ static int is_ancestor(struct btrfs_root *root,
3546 struct fs_path *fs_path) 3547 struct fs_path *fs_path)
3547{ 3548{
3548 u64 ino = ino2; 3549 u64 ino = ino2;
3550 bool free_path = false;
3551 int ret = 0;
3552
3553 if (!fs_path) {
3554 fs_path = fs_path_alloc();
3555 if (!fs_path)
3556 return -ENOMEM;
3557 free_path = true;
3558 }
3549 3559
3550 while (ino > BTRFS_FIRST_FREE_OBJECTID) { 3560 while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3551 int ret;
3552 u64 parent; 3561 u64 parent;
3553 u64 parent_gen; 3562 u64 parent_gen;
3554 3563
@@ -3557,13 +3566,18 @@ static int is_ancestor(struct btrfs_root *root,
3557 if (ret < 0) { 3566 if (ret < 0) {
3558 if (ret == -ENOENT && ino == ino2) 3567 if (ret == -ENOENT && ino == ino2)
3559 ret = 0; 3568 ret = 0;
3560 return ret; 3569 goto out;
3570 }
3571 if (parent == ino1) {
3572 ret = parent_gen == ino1_gen ? 1 : 0;
3573 goto out;
3561 } 3574 }
3562 if (parent == ino1)
3563 return parent_gen == ino1_gen ? 1 : 0;
3564 ino = parent; 3575 ino = parent;
3565 } 3576 }
3566 return 0; 3577 out:
3578 if (free_path)
3579 fs_path_free(fs_path);
3580 return ret;
3567} 3581}
3568 3582
3569static int wait_for_parent_move(struct send_ctx *sctx, 3583static int wait_for_parent_move(struct send_ctx *sctx,
@@ -3686,6 +3700,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
3686 int is_orphan = 0; 3700 int is_orphan = 0;
3687 u64 last_dir_ino_rm = 0; 3701 u64 last_dir_ino_rm = 0;
3688 bool can_rename = true; 3702 bool can_rename = true;
3703 bool orphanized_ancestor = false;
3689 3704
3690 btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino); 3705 btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);
3691 3706
@@ -3837,9 +3852,16 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
3837 * might contain the pre-orphanization name of 3852 * might contain the pre-orphanization name of
3838 * ow_inode, which is no longer valid. 3853 * ow_inode, which is no longer valid.
3839 */ 3854 */
3840 fs_path_reset(valid_path); 3855 ret = is_ancestor(sctx->parent_root,
3841 ret = get_cur_path(sctx, sctx->cur_ino, 3856 ow_inode, ow_gen,
3842 sctx->cur_inode_gen, valid_path); 3857 sctx->cur_ino, NULL);
3858 if (ret > 0) {
3859 orphanized_ancestor = true;
3860 fs_path_reset(valid_path);
3861 ret = get_cur_path(sctx, sctx->cur_ino,
3862 sctx->cur_inode_gen,
3863 valid_path);
3864 }
3843 if (ret < 0) 3865 if (ret < 0)
3844 goto out; 3866 goto out;
3845 } else { 3867 } else {
@@ -3960,6 +3982,43 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
3960 if (ret < 0) 3982 if (ret < 0)
3961 goto out; 3983 goto out;
3962 if (!ret) { 3984 if (!ret) {
3985 /*
3986 * If we orphanized any ancestor before, we need
3987 * to recompute the full path for deleted names,
3988 * since any such path was computed before we
3989 * processed any references and orphanized any
3990 * ancestor inode.
3991 */
3992 if (orphanized_ancestor) {
3993 struct fs_path *new_path;
3994
3995 /*
3996 * Our reference's name member points to
3997 * its full_path member string, so we
3998 * use a new path here.
3999 */
4000 new_path = fs_path_alloc();
4001 if (!new_path) {
4002 ret = -ENOMEM;
4003 goto out;
4004 }
4005 ret = get_cur_path(sctx, cur->dir,
4006 cur->dir_gen,
4007 new_path);
4008 if (ret < 0) {
4009 fs_path_free(new_path);
4010 goto out;
4011 }
4012 ret = fs_path_add(new_path,
4013 cur->name,
4014 cur->name_len);
4015 if (ret < 0) {
4016 fs_path_free(new_path);
4017 goto out;
4018 }
4019 fs_path_free(cur->full_path);
4020 set_ref_path(cur, new_path);
4021 }
3963 ret = send_unlink(sctx, cur->full_path); 4022 ret = send_unlink(sctx, cur->full_path);
3964 if (ret < 0) 4023 if (ret < 0)
3965 goto out; 4024 goto out;
@@ -6397,13 +6456,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
6397 6456
6398 alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); 6457 alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
6399 6458
6400 sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); 6459 sctx->clone_roots = kvzalloc(alloc_size, GFP_KERNEL);
6401 if (!sctx->clone_roots) { 6460 if (!sctx->clone_roots) {
6402 sctx->clone_roots = vzalloc(alloc_size); 6461 ret = -ENOMEM;
6403 if (!sctx->clone_roots) { 6462 goto out;
6404 ret = -ENOMEM;
6405 goto out;
6406 }
6407 } 6463 }
6408 6464
6409 alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); 6465 alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
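Both conversions above (vmalloc to kvmalloc in iterate_dir_item, and the open-coded kzalloc-then-vzalloc fallback for clone_roots collapsing into a single kvzalloc call) rely on the kvmalloc family's contract. A simplified sketch of that contract, assuming GFP_KERNEL-compatible flags; the real helpers in mm/util.c handle more cases:

/* roughly what kvmalloc() does for the call sites above */
void *kvmalloc_sketch(size_t size, gfp_t flags)
{
	/* try physically contiguous memory first, quietly */
	void *p = kmalloc(size, flags | __GFP_NOWARN);

	if (!p)
		p = vmalloc(size);	/* fall back for large allocations */
	return p;
}

The matching kvfree() accepts either kind of pointer, so callers get one allocation call and one free call regardless of which path was taken.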
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4f1cdd5058f1..74e47794e63f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -601,18 +601,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
601 } 601 }
602 break; 602 break;
603 case Opt_alloc_start: 603 case Opt_alloc_start:
604 num = match_strdup(&args[0]); 604 btrfs_info(info,
605 if (num) { 605 "option alloc_start is obsolete, ignored");
606 mutex_lock(&info->chunk_mutex);
607 info->alloc_start = memparse(num, NULL);
608 mutex_unlock(&info->chunk_mutex);
609 kfree(num);
610 btrfs_info(info, "allocations start at %llu",
611 info->alloc_start);
612 } else {
613 ret = -ENOMEM;
614 goto out;
615 }
616 break; 606 break;
617 case Opt_acl: 607 case Opt_acl:
618#ifdef CONFIG_BTRFS_FS_POSIX_ACL 608#ifdef CONFIG_BTRFS_FS_POSIX_ACL
@@ -1187,7 +1177,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
1187 return 0; 1177 return 0;
1188 } 1178 }
1189 1179
1190 btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); 1180 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
1191 1181
1192 trans = btrfs_attach_transaction_barrier(root); 1182 trans = btrfs_attach_transaction_barrier(root);
1193 if (IS_ERR(trans)) { 1183 if (IS_ERR(trans)) {
@@ -1232,8 +1222,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
1232 seq_puts(seq, ",nobarrier"); 1222 seq_puts(seq, ",nobarrier");
1233 if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) 1223 if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
1234 seq_printf(seq, ",max_inline=%llu", info->max_inline); 1224 seq_printf(seq, ",max_inline=%llu", info->max_inline);
1235 if (info->alloc_start != 0)
1236 seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
1237 if (info->thread_pool_size != min_t(unsigned long, 1225 if (info->thread_pool_size != min_t(unsigned long,
1238 num_online_cpus() + 2, 8)) 1226 num_online_cpus() + 2, 8))
1239 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); 1227 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
@@ -1716,7 +1704,6 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1716 unsigned long old_opts = fs_info->mount_opt; 1704 unsigned long old_opts = fs_info->mount_opt;
1717 unsigned long old_compress_type = fs_info->compress_type; 1705 unsigned long old_compress_type = fs_info->compress_type;
1718 u64 old_max_inline = fs_info->max_inline; 1706 u64 old_max_inline = fs_info->max_inline;
1719 u64 old_alloc_start = fs_info->alloc_start;
1720 int old_thread_pool_size = fs_info->thread_pool_size; 1707 int old_thread_pool_size = fs_info->thread_pool_size;
1721 unsigned int old_metadata_ratio = fs_info->metadata_ratio; 1708 unsigned int old_metadata_ratio = fs_info->metadata_ratio;
1722 int ret; 1709 int ret;
@@ -1855,9 +1842,6 @@ restore:
1855 fs_info->mount_opt = old_opts; 1842 fs_info->mount_opt = old_opts;
1856 fs_info->compress_type = old_compress_type; 1843 fs_info->compress_type = old_compress_type;
1857 fs_info->max_inline = old_max_inline; 1844 fs_info->max_inline = old_max_inline;
1858 mutex_lock(&fs_info->chunk_mutex);
1859 fs_info->alloc_start = old_alloc_start;
1860 mutex_unlock(&fs_info->chunk_mutex);
1861 btrfs_resize_thread_pool(fs_info, 1845 btrfs_resize_thread_pool(fs_info,
1862 old_thread_pool_size, fs_info->thread_pool_size); 1846 old_thread_pool_size, fs_info->thread_pool_size);
1863 fs_info->metadata_ratio = old_metadata_ratio; 1847 fs_info->metadata_ratio = old_metadata_ratio;
@@ -1898,18 +1882,15 @@ static inline void btrfs_descending_sort_devices(
1898static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, 1882static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
1899 u64 *free_bytes) 1883 u64 *free_bytes)
1900{ 1884{
1901 struct btrfs_root *root = fs_info->tree_root;
1902 struct btrfs_device_info *devices_info; 1885 struct btrfs_device_info *devices_info;
1903 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 1886 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
1904 struct btrfs_device *device; 1887 struct btrfs_device *device;
1905 u64 skip_space; 1888 u64 skip_space;
1906 u64 type; 1889 u64 type;
1907 u64 avail_space; 1890 u64 avail_space;
1908 u64 used_space;
1909 u64 min_stripe_size; 1891 u64 min_stripe_size;
1910 int min_stripes = 1, num_stripes = 1; 1892 int min_stripes = 1, num_stripes = 1;
1911 int i = 0, nr_devices; 1893 int i = 0, nr_devices;
1912 int ret;
1913 1894
1914 /* 1895 /*
1915 * We aren't under the device list lock, so this is racy-ish, but good 1896 * We aren't under the device list lock, so this is racy-ish, but good
@@ -1927,12 +1908,12 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
1927 } 1908 }
1928 1909
1929 devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), 1910 devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
1930 GFP_NOFS); 1911 GFP_KERNEL);
1931 if (!devices_info) 1912 if (!devices_info)
1932 return -ENOMEM; 1913 return -ENOMEM;
1933 1914
1934 /* calc min stripe number for data space allocation */ 1915 /* calc min stripe number for data space allocation */
1935 type = btrfs_get_alloc_profile(root, 1); 1916 type = btrfs_data_alloc_profile(fs_info);
1936 if (type & BTRFS_BLOCK_GROUP_RAID0) { 1917 if (type & BTRFS_BLOCK_GROUP_RAID0) {
1937 min_stripes = 2; 1918 min_stripes = 2;
1938 num_stripes = nr_devices; 1919 num_stripes = nr_devices;
@@ -1949,8 +1930,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
1949 else 1930 else
1950 min_stripe_size = BTRFS_STRIPE_LEN; 1931 min_stripe_size = BTRFS_STRIPE_LEN;
1951 1932
1952 if (fs_info->alloc_start)
1953 mutex_lock(&fs_devices->device_list_mutex);
1954 rcu_read_lock(); 1933 rcu_read_lock();
1955 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 1934 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
1956 if (!device->in_fs_metadata || !device->bdev || 1935 if (!device->in_fs_metadata || !device->bdev ||
@@ -1973,34 +1952,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
1973 */ 1952 */
1974 skip_space = SZ_1M; 1953 skip_space = SZ_1M;
1975 1954
1976 /* user can set the offset in fs_info->alloc_start. */
1977 if (fs_info->alloc_start &&
1978 fs_info->alloc_start + BTRFS_STRIPE_LEN <=
1979 device->total_bytes) {
1980 rcu_read_unlock();
1981 skip_space = max(fs_info->alloc_start, skip_space);
1982
1983 /*
1984 * btrfs can not use the free space in
1985 * [0, skip_space - 1], we must subtract it from the
1986 * total. In order to implement it, we account the used
1987 * space in this range first.
1988 */
1989 ret = btrfs_account_dev_extents_size(device, 0,
1990 skip_space - 1,
1991 &used_space);
1992 if (ret) {
1993 kfree(devices_info);
1994 mutex_unlock(&fs_devices->device_list_mutex);
1995 return ret;
1996 }
1997
1998 rcu_read_lock();
1999
2000 /* calc the free space in [0, skip_space - 1] */
2001 skip_space -= used_space;
2002 }
2003
2004 /* 1955 /*
2005 * we can use the free space in [0, skip_space - 1], subtract 1956 * we can use the free space in [0, skip_space - 1], subtract
2006 * it from the total. 1957 * it from the total.
@@ -2019,8 +1970,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
2019 i++; 1970 i++;
2020 } 1971 }
2021 rcu_read_unlock(); 1972 rcu_read_unlock();
2022 if (fs_info->alloc_start)
2023 mutex_unlock(&fs_devices->device_list_mutex);
2024 1973
2025 nr_devices = i; 1974 nr_devices = i;
2026 1975
@@ -2057,10 +2006,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
2057 * multiplier to scale the sizes. 2006 * multiplier to scale the sizes.
2058 * 2007 *
2059 * Unused device space usage is based on simulating the chunk allocator 2008 * Unused device space usage is based on simulating the chunk allocator
2060 * algorithm that respects the device sizes, order of allocations and the 2009 * algorithm that respects the device sizes and order of allocations. This is
2061 * 'alloc_start' value, this is a close approximation of the actual use but 2010 * a close approximation of the actual use but there are other factors that may
2062 * there are other factors that may change the result (like a new metadata 2011 * change the result (like a new metadata chunk).
2063 * chunk).
2064 * 2012 *
2065 * If metadata is exhausted, f_bavail will be 0. 2013 * If metadata is exhausted, f_bavail will be 0.
2066 */ 2014 */
@@ -2243,7 +2191,7 @@ static int btrfs_freeze(struct super_block *sb)
2243 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 2191 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
2244 struct btrfs_root *root = fs_info->tree_root; 2192 struct btrfs_root *root = fs_info->tree_root;
2245 2193
2246 fs_info->fs_frozen = 1; 2194 set_bit(BTRFS_FS_FROZEN, &fs_info->flags);
2247 /* 2195 /*
2248 * We don't need a barrier here, we'll wait for any transaction that 2196 * We don't need a barrier here, we'll wait for any transaction that
2249 * could be in progress on other threads (and do delayed iputs that 2197 * could be in progress on other threads (and do delayed iputs that
@@ -2262,7 +2210,9 @@ static int btrfs_freeze(struct super_block *sb)
2262 2210
2263static int btrfs_unfreeze(struct super_block *sb) 2211static int btrfs_unfreeze(struct super_block *sb)
2264{ 2212{
2265 btrfs_sb(sb)->fs_frozen = 0; 2213 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
2214
2215 clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
2266 return 0; 2216 return 0;
2267} 2217}
2268 2218
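The fs_frozen conversion above folds a standalone int into the shared fs_info->flags word. Since set_bit(), clear_bit() and test_bit() are atomic bit operations, the freeze state needs no extra locking; the pattern, as an illustrative sketch with BTRFS_FS_FROZEN being one bit index in fs_info->flags:

	/* freeze side */
	set_bit(BTRFS_FS_FROZEN, &fs_info->flags);

	/* any other thread, e.g. the transaction commit path */
	if (!test_bit(BTRFS_FS_FROZEN, &fs_info->flags))
		btrfs_run_delayed_iputs(fs_info);

	/* thaw side */
	clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);

test_bit() is only a point-in-time check, which is sufficient here because, per the comment in btrfs_freeze(), the freeze path separately waits out any transaction already in progress.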
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 1f157fba8940..c2d5f3580b4c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -447,11 +447,52 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
447 447
448BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); 448BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
449 449
450static ssize_t quota_override_show(struct kobject *kobj,
451 struct kobj_attribute *a, char *buf)
452{
453 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
454 int quota_override;
455
456 quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
457 return snprintf(buf, PAGE_SIZE, "%d\n", quota_override);
458}
459
460static ssize_t quota_override_store(struct kobject *kobj,
461 struct kobj_attribute *a,
462 const char *buf, size_t len)
463{
464 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
465 unsigned long knob;
466 int err;
467
468 if (!fs_info)
469 return -EPERM;
470
471 if (!capable(CAP_SYS_RESOURCE))
472 return -EPERM;
473
474 err = kstrtoul(buf, 10, &knob);
475 if (err)
476 return err;
477 if (knob > 1)
478 return -EINVAL;
479
480 if (knob)
481 set_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
482 else
483 clear_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
484
485 return len;
486}
487
488BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store);
489
450static const struct attribute *btrfs_attrs[] = { 490static const struct attribute *btrfs_attrs[] = {
451 BTRFS_ATTR_PTR(label), 491 BTRFS_ATTR_PTR(label),
452 BTRFS_ATTR_PTR(nodesize), 492 BTRFS_ATTR_PTR(nodesize),
453 BTRFS_ATTR_PTR(sectorsize), 493 BTRFS_ATTR_PTR(sectorsize),
454 BTRFS_ATTR_PTR(clone_alignment), 494 BTRFS_ATTR_PTR(clone_alignment),
495 BTRFS_ATTR_PTR(quota_override),
455 NULL, 496 NULL,
456}; 497};
457 498
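From userspace the new attribute appears as /sys/fs/btrfs/<UUID>/quota_override; reading returns 0 or 1, and writing requires CAP_SYS_RESOURCE, matching the store handler above. A hypothetical caller, for illustration only:

#include <stdio.h>

/* enable the quota override knob for the filesystem with this UUID */
static int enable_quota_override(const char *uuid)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/fs/btrfs/%s/quota_override", uuid);
	f = fopen(path, "w");
	if (!f)
		return -1;
	/* the handler accepts 0 or 1; larger values get -EINVAL */
	if (fputs("1", f) < 0) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}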
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 133753232a94..d06b1c931d05 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -87,7 +87,7 @@ static int test_find_delalloc(u32 sectorsize)
87 return -ENOMEM; 87 return -ENOMEM;
88 } 88 }
89 89
90 extent_io_tree_init(&tmp, &inode->i_data); 90 extent_io_tree_init(&tmp, inode);
91 91
92 /* 92 /*
93 * First go through and create and mark all of our pages dirty, we pin 93 * First go through and create and mark all of our pages dirty, we pin
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2168654c90a1..f615d59b0489 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -93,7 +93,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
93 btrfs_put_block_group_trimming(cache); 93 btrfs_put_block_group_trimming(cache);
94 btrfs_put_block_group(cache); 94 btrfs_put_block_group(cache);
95 } 95 }
96 kmem_cache_free(btrfs_transaction_cachep, transaction); 96 kfree(transaction);
97 } 97 }
98} 98}
99 99
@@ -228,7 +228,7 @@ loop:
228 */ 228 */
229 BUG_ON(type == TRANS_JOIN_NOLOCK); 229 BUG_ON(type == TRANS_JOIN_NOLOCK);
230 230
231 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 231 cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS);
232 if (!cur_trans) 232 if (!cur_trans)
233 return -ENOMEM; 233 return -ENOMEM;
234 234
@@ -238,11 +238,11 @@ loop:
238 * someone started a transaction after we unlocked. Make sure 238 * someone started a transaction after we unlocked. Make sure
239 * to redo the checks above 239 * to redo the checks above
240 */ 240 */
241 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 241 kfree(cur_trans);
242 goto loop; 242 goto loop;
243 } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { 243 } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
244 spin_unlock(&fs_info->trans_lock); 244 spin_unlock(&fs_info->trans_lock);
245 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 245 kfree(cur_trans);
246 return -EROFS; 246 return -EROFS;
247 } 247 }
248 248
@@ -294,7 +294,7 @@ loop:
294 spin_lock_init(&cur_trans->dropped_roots_lock); 294 spin_lock_init(&cur_trans->dropped_roots_lock);
295 list_add_tail(&cur_trans->list, &fs_info->trans_list); 295 list_add_tail(&cur_trans->list, &fs_info->trans_list);
296 extent_io_tree_init(&cur_trans->dirty_pages, 296 extent_io_tree_init(&cur_trans->dirty_pages,
297 fs_info->btree_inode->i_mapping); 297 fs_info->btree_inode);
298 fs_info->generation++; 298 fs_info->generation++;
299 cur_trans->transid = fs_info->generation; 299 cur_trans->transid = fs_info->generation;
300 fs_info->running_transaction = cur_trans; 300 fs_info->running_transaction = cur_trans;
@@ -1374,9 +1374,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
1374 ret = commit_fs_roots(trans, fs_info); 1374 ret = commit_fs_roots(trans, fs_info);
1375 if (ret) 1375 if (ret)
1376 goto out; 1376 goto out;
1377 ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
1378 if (ret < 0)
1379 goto out;
1380 ret = btrfs_qgroup_account_extents(trans, fs_info); 1377 ret = btrfs_qgroup_account_extents(trans, fs_info);
1381 if (ret < 0) 1378 if (ret < 0)
1382 goto out; 1379 goto out;
@@ -1926,7 +1923,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1926static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) 1923static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
1927{ 1924{
1928 if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) 1925 if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
1929 btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); 1926 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
1930} 1927}
1931 1928
1932static inline void 1929static inline void
@@ -2180,13 +2177,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
2180 goto scrub_continue; 2177 goto scrub_continue;
2181 } 2178 }
2182 2179
2183 ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
2184 if (ret) {
2185 mutex_unlock(&fs_info->tree_log_mutex);
2186 mutex_unlock(&fs_info->reloc_mutex);
2187 goto scrub_continue;
2188 }
2189
2190 /* 2180 /*
2191 * Since fs roots are all committed, we can get a quite accurate 2181 * Since fs roots are all committed, we can get a quite accurate
2192 * new_roots. So let's do quota accounting. 2182 * new_roots. So let's do quota accounting.
@@ -2314,7 +2304,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
2314 * it'll result in deadlock about SB_FREEZE_FS. 2304 * it'll result in deadlock about SB_FREEZE_FS.
2315 */ 2305 */
2316 if (current != fs_info->transaction_kthread && 2306 if (current != fs_info->transaction_kthread &&
2317 current != fs_info->cleaner_kthread && !fs_info->fs_frozen) 2307 current != fs_info->cleaner_kthread &&
2308 !test_bit(BTRFS_FS_FROZEN, &fs_info->flags))
2318 btrfs_run_delayed_iputs(fs_info); 2309 btrfs_run_delayed_iputs(fs_info);
2319 2310
2320 return ret; 2311 return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ccfe9fe7754a..f20ef211a73d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1175,15 +1175,19 @@ next:
1175 return 0; 1175 return 0;
1176} 1176}
1177 1177
1178static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1178static int extref_get_fields(struct extent_buffer *eb, int slot,
1179 u32 *namelen, char **name, u64 *index, 1179 unsigned long ref_ptr, u32 *namelen, char **name,
1180 u64 *parent_objectid) 1180 u64 *index, u64 *parent_objectid)
1181{ 1181{
1182 struct btrfs_inode_extref *extref; 1182 struct btrfs_inode_extref *extref;
1183 1183
1184 extref = (struct btrfs_inode_extref *)ref_ptr; 1184 extref = (struct btrfs_inode_extref *)ref_ptr;
1185 1185
1186 *namelen = btrfs_inode_extref_name_len(eb, extref); 1186 *namelen = btrfs_inode_extref_name_len(eb, extref);
1187 if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name,
1188 *namelen))
1189 return -EIO;
1190
1187 *name = kmalloc(*namelen, GFP_NOFS); 1191 *name = kmalloc(*namelen, GFP_NOFS);
1188 if (*name == NULL) 1192 if (*name == NULL)
1189 return -ENOMEM; 1193 return -ENOMEM;
@@ -1198,14 +1202,19 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1198 return 0; 1202 return 0;
1199} 1203}
1200 1204
1201static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1205static int ref_get_fields(struct extent_buffer *eb, int slot,
1202 u32 *namelen, char **name, u64 *index) 1206 unsigned long ref_ptr, u32 *namelen, char **name,
1207 u64 *index)
1203{ 1208{
1204 struct btrfs_inode_ref *ref; 1209 struct btrfs_inode_ref *ref;
1205 1210
1206 ref = (struct btrfs_inode_ref *)ref_ptr; 1211 ref = (struct btrfs_inode_ref *)ref_ptr;
1207 1212
1208 *namelen = btrfs_inode_ref_name_len(eb, ref); 1213 *namelen = btrfs_inode_ref_name_len(eb, ref);
1214 if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1),
1215 *namelen))
1216 return -EIO;
1217
1209 *name = kmalloc(*namelen, GFP_NOFS); 1218 *name = kmalloc(*namelen, GFP_NOFS);
1210 if (*name == NULL) 1219 if (*name == NULL)
1211 return -ENOMEM; 1220 return -ENOMEM;
@@ -1280,8 +1289,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1280 1289
1281 while (ref_ptr < ref_end) { 1290 while (ref_ptr < ref_end) {
1282 if (log_ref_ver) { 1291 if (log_ref_ver) {
1283 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1292 ret = extref_get_fields(eb, slot, ref_ptr, &namelen,
1284 &ref_index, &parent_objectid); 1293 &name, &ref_index, &parent_objectid);
1285 /* 1294 /*
1286 * parent object can change from one array 1295 * parent object can change from one array
1287 * item to another. 1296 * item to another.
@@ -1293,8 +1302,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1293 goto out; 1302 goto out;
1294 } 1303 }
1295 } else { 1304 } else {
1296 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1305 ret = ref_get_fields(eb, slot, ref_ptr, &namelen,
1297 &ref_index); 1306 &name, &ref_index);
1298 } 1307 }
1299 if (ret) 1308 if (ret)
1300 goto out; 1309 goto out;
@@ -1841,7 +1850,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1841 ptr_end = ptr + item_size; 1850 ptr_end = ptr + item_size;
1842 while (ptr < ptr_end) { 1851 while (ptr < ptr_end) {
1843 di = (struct btrfs_dir_item *)ptr; 1852 di = (struct btrfs_dir_item *)ptr;
1844 if (verify_dir_item(fs_info, eb, di)) 1853 if (verify_dir_item(fs_info, eb, slot, di))
1845 return -EIO; 1854 return -EIO;
1846 name_len = btrfs_dir_name_len(eb, di); 1855 name_len = btrfs_dir_name_len(eb, di);
1847 ret = replay_one_name(trans, root, path, eb, di, key); 1856 ret = replay_one_name(trans, root, path, eb, di, key);
@@ -2017,7 +2026,7 @@ again:
2017 ptr_end = ptr + item_size; 2026 ptr_end = ptr + item_size;
2018 while (ptr < ptr_end) { 2027 while (ptr < ptr_end) {
2019 di = (struct btrfs_dir_item *)ptr; 2028 di = (struct btrfs_dir_item *)ptr;
2020 if (verify_dir_item(fs_info, eb, di)) { 2029 if (verify_dir_item(fs_info, eb, slot, di)) {
2021 ret = -EIO; 2030 ret = -EIO;
2022 goto out; 2031 goto out;
2023 } 2032 }
@@ -2102,6 +2111,7 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2102 struct btrfs_path *path, 2111 struct btrfs_path *path,
2103 const u64 ino) 2112 const u64 ino)
2104{ 2113{
2114 struct btrfs_fs_info *fs_info = root->fs_info;
2105 struct btrfs_key search_key; 2115 struct btrfs_key search_key;
2106 struct btrfs_path *log_path; 2116 struct btrfs_path *log_path;
2107 int i; 2117 int i;
@@ -2143,6 +2153,12 @@ process_leaf:
2143 u32 this_len = sizeof(*di) + name_len + data_len; 2153 u32 this_len = sizeof(*di) + name_len + data_len;
2144 char *name; 2154 char *name;
2145 2155
2156 ret = verify_dir_item(fs_info, path->nodes[0],
2157 path->slots[0], di);
2158 if (ret) {
2159 ret = -EIO;
2160 goto out;
2161 }
2146 name = kmalloc(name_len, GFP_NOFS); 2162 name = kmalloc(name_len, GFP_NOFS);
2147 if (!name) { 2163 if (!name) {
2148 ret = -ENOMEM; 2164 ret = -ENOMEM;
@@ -4546,6 +4562,12 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4546 this_len = sizeof(*extref) + this_name_len; 4562 this_len = sizeof(*extref) + this_name_len;
4547 } 4563 }
4548 4564
4565 ret = btrfs_is_name_len_valid(eb, slot, name_ptr,
4566 this_name_len);
4567 if (!ret) {
4568 ret = -EIO;
4569 goto out;
4570 }
4549 if (this_name_len > name_len) { 4571 if (this_name_len > name_len) {
4550 char *new_name; 4572 char *new_name;
4551 4573
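The tree-log call sites above all validate a name length before allocating a buffer and copying the name out of the leaf. The checked property is, in essence, that the name stays inside the on-disk item it belongs to; a sketch of that bound using existing leaf accessors (the real btrfs_is_name_len_valid() helper added by this series also accounts for the different item layouts per key type):

/* illustrative only: the name must lie fully inside item 'slot' */
static bool name_fits_in_item(struct extent_buffer *leaf, int slot,
			      unsigned long name_ptr, u32 name_len)
{
	unsigned long item_start = btrfs_item_ptr_offset(leaf, slot);
	u32 item_size = btrfs_item_size_nr(leaf, slot);

	return name_ptr >= item_start &&
	       name_ptr + name_len <= item_start + item_size;
}

Without such a check, a crafted or corrupted name_len would make the following kmalloc() size and read_extent_buffer() copy run past the end of the item.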
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 84a495967e0a..5eb7217738ed 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -242,6 +242,17 @@ static struct btrfs_device *__alloc_device(void)
242 if (!dev) 242 if (!dev)
243 return ERR_PTR(-ENOMEM); 243 return ERR_PTR(-ENOMEM);
244 244
245 /*
246 * Preallocate a bio that's always going to be used for flushing device
247 * barriers and matches the device lifespan
248 */
249 dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
250 if (!dev->flush_bio) {
251 kfree(dev);
252 return ERR_PTR(-ENOMEM);
253 }
254 bio_get(dev->flush_bio);
255
245 INIT_LIST_HEAD(&dev->dev_list); 256 INIT_LIST_HEAD(&dev->dev_list);
246 INIT_LIST_HEAD(&dev->dev_alloc_list); 257 INIT_LIST_HEAD(&dev->dev_alloc_list);
247 INIT_LIST_HEAD(&dev->resized_list); 258 INIT_LIST_HEAD(&dev->resized_list);
@@ -838,6 +849,7 @@ static void __free_device(struct work_struct *work)
838 849
839 device = container_of(work, struct btrfs_device, rcu_work); 850 device = container_of(work, struct btrfs_device, rcu_work);
840 rcu_string_free(device->name); 851 rcu_string_free(device->name);
852 bio_put(device->flush_bio);
841 kfree(device); 853 kfree(device);
842} 854}
843 855
@@ -1353,15 +1365,13 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
1353 int ret; 1365 int ret;
1354 int slot; 1366 int slot;
1355 struct extent_buffer *l; 1367 struct extent_buffer *l;
1356 u64 min_search_start;
1357 1368
1358 /* 1369 /*
1359 * We don't want to overwrite the superblock on the drive nor any area 1370 * We don't want to overwrite the superblock on the drive nor any area
1360 * used by the boot loader (grub for example), so we make sure to start 1371 * used by the boot loader (grub for example), so we make sure to start
1361 * at an offset of at least 1MB. 1372 * at an offset of at least 1MB.
1362 */ 1373 */
1363 min_search_start = max(fs_info->alloc_start, 1024ull * 1024); 1374 search_start = max_t(u64, search_start, SZ_1M);
1364 search_start = max(search_start, min_search_start);
1365 1375
1366 path = btrfs_alloc_path(); 1376 path = btrfs_alloc_path();
1367 if (!path) 1377 if (!path)
@@ -2387,7 +2397,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
2387 device->io_width = fs_info->sectorsize; 2397 device->io_width = fs_info->sectorsize;
2388 device->io_align = fs_info->sectorsize; 2398 device->io_align = fs_info->sectorsize;
2389 device->sector_size = fs_info->sectorsize; 2399 device->sector_size = fs_info->sectorsize;
2390 device->total_bytes = i_size_read(bdev->bd_inode); 2400 device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2401 fs_info->sectorsize);
2391 device->disk_total_bytes = device->total_bytes; 2402 device->disk_total_bytes = device->total_bytes;
2392 device->commit_total_bytes = device->total_bytes; 2403 device->commit_total_bytes = device->total_bytes;
2393 device->fs_info = fs_info; 2404 device->fs_info = fs_info;
@@ -2417,16 +2428,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
2417 fs_info->fs_devices->total_devices++; 2428 fs_info->fs_devices->total_devices++;
2418 fs_info->fs_devices->total_rw_bytes += device->total_bytes; 2429 fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2419 2430
2420 spin_lock(&fs_info->free_chunk_lock); 2431 atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2421 fs_info->free_chunk_space += device->total_bytes;
2422 spin_unlock(&fs_info->free_chunk_lock);
2423 2432
2424 if (!blk_queue_nonrot(q)) 2433 if (!blk_queue_nonrot(q))
2425 fs_info->fs_devices->rotating = 1; 2434 fs_info->fs_devices->rotating = 1;
2426 2435
2427 tmp = btrfs_super_total_bytes(fs_info->super_copy); 2436 tmp = btrfs_super_total_bytes(fs_info->super_copy);
2428 btrfs_set_super_total_bytes(fs_info->super_copy, 2437 btrfs_set_super_total_bytes(fs_info->super_copy,
2429 tmp + device->total_bytes); 2438 round_down(tmp + device->total_bytes, fs_info->sectorsize));
2430 2439
2431 tmp = btrfs_super_num_devices(fs_info->super_copy); 2440 tmp = btrfs_super_num_devices(fs_info->super_copy);
2432 btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1); 2441 btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
@@ -2574,7 +2583,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2574 goto error; 2583 goto error;
2575 } 2584 }
2576 2585
2577 name = rcu_string_strdup(device_path, GFP_NOFS); 2586 name = rcu_string_strdup(device_path, GFP_KERNEL);
2578 if (!name) { 2587 if (!name) {
2579 kfree(device); 2588 kfree(device);
2580 ret = -ENOMEM; 2589 ret = -ENOMEM;
@@ -2689,6 +2698,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
2689 if (!device->writeable) 2698 if (!device->writeable)
2690 return -EACCES; 2699 return -EACCES;
2691 2700
2701 new_size = round_down(new_size, fs_info->sectorsize);
2702
2692 mutex_lock(&fs_info->chunk_mutex); 2703 mutex_lock(&fs_info->chunk_mutex);
2693 old_total = btrfs_super_total_bytes(super_copy); 2704 old_total = btrfs_super_total_bytes(super_copy);
2694 diff = new_size - device->total_bytes; 2705 diff = new_size - device->total_bytes;
@@ -2701,7 +2712,8 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
2701 2712
2702 fs_devices = fs_info->fs_devices; 2713 fs_devices = fs_info->fs_devices;
2703 2714
2704 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2715 btrfs_set_super_total_bytes(super_copy,
2716 round_down(old_total + diff, fs_info->sectorsize));
2705 device->fs_devices->total_rw_bytes += diff; 2717 device->fs_devices->total_rw_bytes += diff;
2706 2718
2707 btrfs_device_set_total_bytes(device, new_size); 2719 btrfs_device_set_total_bytes(device, new_size);
@@ -2874,9 +2886,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2874 mutex_lock(&fs_info->chunk_mutex); 2886 mutex_lock(&fs_info->chunk_mutex);
2875 btrfs_device_set_bytes_used(device, 2887 btrfs_device_set_bytes_used(device,
2876 device->bytes_used - dev_extent_len); 2888 device->bytes_used - dev_extent_len);
2877 spin_lock(&fs_info->free_chunk_lock); 2889 atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2878 fs_info->free_chunk_space += dev_extent_len;
2879 spin_unlock(&fs_info->free_chunk_lock);
2880 btrfs_clear_space_info_full(fs_info); 2890 btrfs_clear_space_info_full(fs_info);
2881 mutex_unlock(&fs_info->chunk_mutex); 2891 mutex_unlock(&fs_info->chunk_mutex);
2882 } 2892 }
@@ -4393,7 +4403,10 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4393 struct btrfs_super_block *super_copy = fs_info->super_copy; 4403 struct btrfs_super_block *super_copy = fs_info->super_copy;
4394 u64 old_total = btrfs_super_total_bytes(super_copy); 4404 u64 old_total = btrfs_super_total_bytes(super_copy);
4395 u64 old_size = btrfs_device_get_total_bytes(device); 4405 u64 old_size = btrfs_device_get_total_bytes(device);
4396 u64 diff = old_size - new_size; 4406 u64 diff;
4407
4408 new_size = round_down(new_size, fs_info->sectorsize);
4409 diff = old_size - new_size;
4397 4410
4398 if (device->is_tgtdev_for_dev_replace) 4411 if (device->is_tgtdev_for_dev_replace)
4399 return -EINVAL; 4412 return -EINVAL;
@@ -4409,9 +4422,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4409 btrfs_device_set_total_bytes(device, new_size); 4422 btrfs_device_set_total_bytes(device, new_size);
4410 if (device->writeable) { 4423 if (device->writeable) {
4411 device->fs_devices->total_rw_bytes -= diff; 4424 device->fs_devices->total_rw_bytes -= diff;
4412 spin_lock(&fs_info->free_chunk_lock); 4425 atomic64_sub(diff, &fs_info->free_chunk_space);
4413 fs_info->free_chunk_space -= diff;
4414 spin_unlock(&fs_info->free_chunk_lock);
4415 } 4426 }
4416 mutex_unlock(&fs_info->chunk_mutex); 4427 mutex_unlock(&fs_info->chunk_mutex);
4417 4428
@@ -4522,7 +4533,8 @@ again:
4522 &fs_info->fs_devices->resized_devices); 4533 &fs_info->fs_devices->resized_devices);
4523 4534
4524 WARN_ON(diff > old_total); 4535 WARN_ON(diff > old_total);
4525 btrfs_set_super_total_bytes(super_copy, old_total - diff); 4536 btrfs_set_super_total_bytes(super_copy,
4537 round_down(old_total - diff, fs_info->sectorsize));
4526 mutex_unlock(&fs_info->chunk_mutex); 4538 mutex_unlock(&fs_info->chunk_mutex);
4527 4539
4528 /* Now btrfs_update_device() will change the on-disk size. */ 4540 /* Now btrfs_update_device() will change the on-disk size. */
@@ -4535,9 +4547,7 @@ done:
4535 btrfs_device_set_total_bytes(device, old_size); 4547 btrfs_device_set_total_bytes(device, old_size);
4536 if (device->writeable) 4548 if (device->writeable)
4537 device->fs_devices->total_rw_bytes += diff; 4549 device->fs_devices->total_rw_bytes += diff;
4538 spin_lock(&fs_info->free_chunk_lock); 4550 atomic64_add(diff, &fs_info->free_chunk_space);
4539 fs_info->free_chunk_space += diff;
4540 spin_unlock(&fs_info->free_chunk_lock);
4541 mutex_unlock(&fs_info->chunk_mutex); 4551 mutex_unlock(&fs_info->chunk_mutex);
4542 } 4552 }
4543 return ret; 4553 return ret;
@@ -4882,9 +4892,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4882 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); 4892 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
4883 } 4893 }
4884 4894
4885 spin_lock(&info->free_chunk_lock); 4895 atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
4886 info->free_chunk_space -= (stripe_size * map->num_stripes);
4887 spin_unlock(&info->free_chunk_lock);
4888 4896
4889 free_extent_map(em); 4897 free_extent_map(em);
4890 check_raid56_incompat_flag(info, type); 4898 check_raid56_incompat_flag(info, type);
@@ -5029,20 +5037,19 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
5029static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 5037static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
5030 struct btrfs_fs_info *fs_info) 5038 struct btrfs_fs_info *fs_info)
5031{ 5039{
5032 struct btrfs_root *extent_root = fs_info->extent_root;
5033 u64 chunk_offset; 5040 u64 chunk_offset;
5034 u64 sys_chunk_offset; 5041 u64 sys_chunk_offset;
5035 u64 alloc_profile; 5042 u64 alloc_profile;
5036 int ret; 5043 int ret;
5037 5044
5038 chunk_offset = find_next_chunk(fs_info); 5045 chunk_offset = find_next_chunk(fs_info);
5039 alloc_profile = btrfs_get_alloc_profile(extent_root, 0); 5046 alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5040 ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile); 5047 ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
5041 if (ret) 5048 if (ret)
5042 return ret; 5049 return ret;
5043 5050
5044 sys_chunk_offset = find_next_chunk(fs_info); 5051 sys_chunk_offset = find_next_chunk(fs_info);
5045 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 5052 alloc_profile = btrfs_system_alloc_profile(fs_info);
5046 ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile); 5053 ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
5047 return ret; 5054 return ret;
5048} 5055}
@@ -6267,10 +6274,9 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6267 continue; 6274 continue;
6268 } 6275 }
6269 6276
6270 if (dev_nr < total_devs - 1) { 6277 if (dev_nr < total_devs - 1)
6271 bio = btrfs_bio_clone(first_bio, GFP_NOFS); 6278 bio = btrfs_bio_clone(first_bio);
6272 BUG_ON(!bio); /* -ENOMEM */ 6279 else
6273 } else
6274 bio = first_bio; 6280 bio = first_bio;
6275 6281
6276 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, 6282 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
@@ -6685,10 +6691,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
6685 device->in_fs_metadata = 1; 6691 device->in_fs_metadata = 1;
6686 if (device->writeable && !device->is_tgtdev_for_dev_replace) { 6692 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
6687 device->fs_devices->total_rw_bytes += device->total_bytes; 6693 device->fs_devices->total_rw_bytes += device->total_bytes;
6688 spin_lock(&fs_info->free_chunk_lock); 6694 atomic64_add(device->total_bytes - device->bytes_used,
6689 fs_info->free_chunk_space += device->total_bytes - 6695 &fs_info->free_chunk_space);
6690 device->bytes_used;
6691 spin_unlock(&fs_info->free_chunk_lock);
6692 } 6696 }
6693 ret = 0; 6697 ret = 0;
6694 return ret; 6698 return ret;
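Two patterns repeat through the volumes.c hunks. First, device sizes now pass through round_down(..., fs_info->sectorsize) before entering the superblock totals, keeping total_bytes sector-aligned; with 4096-byte sectors, for example, a 10,000,000,000-byte device is accounted as 2,441,406 whole sectors, i.e. 9,999,998,976 bytes. Second, free_chunk_space becomes an atomic64_t, collapsing each spin_lock/update/spin_unlock triple into a single atomic operation. An illustrative sketch of both, not the exact call sites:

	/* sector-align the usable size of a device */
	u64 total = round_down(i_size_read(bdev->bd_inode),
			       fs_info->sectorsize);

	/* lockless counter updates, replacing the free_chunk_lock pair */
	atomic64_add(total, &fs_info->free_chunk_space);
	...
	atomic64_sub(allocated, &fs_info->free_chunk_space);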
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index c7d0fbc915ca..6f45fd60d15a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -74,6 +74,8 @@ struct btrfs_device {
74 int missing; 74 int missing;
75 int can_discard; 75 int can_discard;
76 int is_tgtdev_for_dev_replace; 76 int is_tgtdev_for_dev_replace;
77 int last_flush_error;
78 int flush_bio_sent;
77 79
78#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED 80#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
79 seqcount_t data_seqcount; 81 seqcount_t data_seqcount;
@@ -279,6 +281,11 @@ struct btrfs_io_bio {
279 u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; 281 u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
280 u8 *csum_allocated; 282 u8 *csum_allocated;
281 btrfs_io_bio_end_io_t *end_io; 283 btrfs_io_bio_end_io_t *end_io;
284 struct bvec_iter iter;
285 /*
286 * This member must come last, bio_alloc_bioset will allocate enough
287 * bytes for the entire btrfs_io_bio but relies on bio being last.
288 */
282 struct bio bio; 289 struct bio bio;
283}; 290};
284 291
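The comment added above works together with how these bios are allocated: the btrfs bioset reserves front_pad space of offsetof(struct btrfs_io_bio, bio), so the struct is really front padding of the embedded bio, and the wrapper is recovered by pointer arithmetic. A sketch of the accessor this layout enables (btrfs has a helper of this shape):

static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
{
	/*
	 * 'bio' sits at a fixed offset at the end of btrfs_io_bio;
	 * any member placed after it would alias memory the block
	 * layer never allocated.
	 */
	return container_of(bio, struct btrfs_io_bio, bio);
}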
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index b3cbf80c5acf..2c7e53f9ff1b 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -336,7 +336,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
336 u32 this_len = sizeof(*di) + name_len + data_len; 336 u32 this_len = sizeof(*di) + name_len + data_len;
337 unsigned long name_ptr = (unsigned long)(di + 1); 337 unsigned long name_ptr = (unsigned long)(di + 1);
338 338
339 if (verify_dir_item(fs_info, leaf, di)) { 339 if (verify_dir_item(fs_info, leaf, slot, di)) {
340 ret = -EIO; 340 ret = -EIO;
341 goto err; 341 goto err;
342 } 342 }
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 135b10823c6d..c248f9286366 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -24,12 +24,13 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/zlib.h> 25#include <linux/zlib.h>
26#include <linux/zutil.h> 26#include <linux/zutil.h>
27#include <linux/vmalloc.h> 27#include <linux/mm.h>
28#include <linux/init.h> 28#include <linux/init.h>
29#include <linux/err.h> 29#include <linux/err.h>
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/pagemap.h> 31#include <linux/pagemap.h>
32#include <linux/bio.h> 32#include <linux/bio.h>
33#include <linux/refcount.h>
33#include "compression.h" 34#include "compression.h"
34 35
35struct workspace { 36struct workspace {
@@ -42,7 +43,7 @@ static void zlib_free_workspace(struct list_head *ws)
42{ 43{
43 struct workspace *workspace = list_entry(ws, struct workspace, list); 44 struct workspace *workspace = list_entry(ws, struct workspace, list);
44 45
45 vfree(workspace->strm.workspace); 46 kvfree(workspace->strm.workspace);
46 kfree(workspace->buf); 47 kfree(workspace->buf);
47 kfree(workspace); 48 kfree(workspace);
48} 49}
@@ -52,14 +53,14 @@ static struct list_head *zlib_alloc_workspace(void)
52 struct workspace *workspace; 53 struct workspace *workspace;
53 int workspacesize; 54 int workspacesize;
54 55
55 workspace = kzalloc(sizeof(*workspace), GFP_NOFS); 56 workspace = kzalloc(sizeof(*workspace), GFP_KERNEL);
56 if (!workspace) 57 if (!workspace)
57 return ERR_PTR(-ENOMEM); 58 return ERR_PTR(-ENOMEM);
58 59
59 workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), 60 workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
60 zlib_inflate_workspacesize()); 61 zlib_inflate_workspacesize());
61 workspace->strm.workspace = vmalloc(workspacesize); 62 workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);
62 workspace->buf = kmalloc(PAGE_SIZE, GFP_NOFS); 63 workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
63 if (!workspace->strm.workspace || !workspace->buf) 64 if (!workspace->strm.workspace || !workspace->buf)
64 goto fail; 65 goto fail;
65 66
@@ -211,10 +212,7 @@ out:
211 return ret; 212 return ret;
212} 213}
213 214
214static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in, 215static int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
215 u64 disk_start,
216 struct bio *orig_bio,
217 size_t srclen)
218{ 216{
219 struct workspace *workspace = list_entry(ws, struct workspace, list); 217 struct workspace *workspace = list_entry(ws, struct workspace, list);
220 int ret = 0, ret2; 218 int ret = 0, ret2;
@@ -222,8 +220,12 @@ static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in,
222 char *data_in; 220 char *data_in;
223 size_t total_out = 0; 221 size_t total_out = 0;
224 unsigned long page_in_index = 0; 222 unsigned long page_in_index = 0;
223 size_t srclen = cb->compressed_len;
225 unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); 224 unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
226 unsigned long buf_start; 225 unsigned long buf_start;
226 struct page **pages_in = cb->compressed_pages;
227 u64 disk_start = cb->start;
228 struct bio *orig_bio = cb->orig_bio;
227 229
228 data_in = kmap(pages_in[page_in_index]); 230 data_in = kmap(pages_in[page_in_index]);
229 workspace->strm.next_in = data_in; 231 workspace->strm.next_in = data_in;
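The workspace switch to kvmalloc() is motivated by size: zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL) comes to a few hundred kilobytes (the inflate workspace is smaller), and as a kmalloc() that would be a high-order contiguous allocation that can fail on a fragmented machine even with plenty of free memory. kvmalloc() degrades gracefully to vmalloc-backed memory in that case, and the kvfree() in zlib_free_workspace() handles both outcomes. Annotated excerpt of the new allocation step:

	/* worst case of the two directions, typically hundreds of KiB */
	workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
			    zlib_inflate_workspacesize());
	/* contiguous if possible, vmalloc-backed if not */
	workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);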
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index e37973526153..cd99a3658156 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1410,42 +1410,6 @@ DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy,
1410 TP_ARGS(wq) 1410 TP_ARGS(wq)
1411); 1411);
1412 1412
1413DECLARE_EVENT_CLASS(btrfs__qgroup_data_map,
1414
1415 TP_PROTO(struct inode *inode, u64 free_reserved),
1416
1417 TP_ARGS(inode, free_reserved),
1418
1419 TP_STRUCT__entry_btrfs(
1420 __field( u64, rootid )
1421 __field( unsigned long, ino )
1422 __field( u64, free_reserved )
1423 ),
1424
1425 TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
1426 __entry->rootid = BTRFS_I(inode)->root->objectid;
1427 __entry->ino = inode->i_ino;
1428 __entry->free_reserved = free_reserved;
1429 ),
1430
1431 TP_printk_btrfs("rootid=%llu ino=%lu free_reserved=%llu",
1432 __entry->rootid, __entry->ino, __entry->free_reserved)
1433);
1434
1435DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_init_data_rsv_map,
1436
1437 TP_PROTO(struct inode *inode, u64 free_reserved),
1438
1439 TP_ARGS(inode, free_reserved)
1440);
1441
1442DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_free_data_rsv_map,
1443
1444 TP_PROTO(struct inode *inode, u64 free_reserved),
1445
1446 TP_ARGS(inode, free_reserved)
1447);
1448
1449#define BTRFS_QGROUP_OPERATIONS \ 1413#define BTRFS_QGROUP_OPERATIONS \
1450 { QGROUP_RESERVE, "reserve" }, \ 1414 { QGROUP_RESERVE, "reserve" }, \
1451 { QGROUP_RELEASE, "release" }, \ 1415 { QGROUP_RELEASE, "release" }, \
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index a456e5309238..9aa74f317747 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -426,31 +426,54 @@ struct btrfs_ioctl_ino_lookup_args {
426 char name[BTRFS_INO_LOOKUP_PATH_MAX]; 426 char name[BTRFS_INO_LOOKUP_PATH_MAX];
427}; 427};
428 428
429/* Search criteria for the btrfs SEARCH ioctl family. */
429struct btrfs_ioctl_search_key { 430struct btrfs_ioctl_search_key {
430 /* which root are we searching. 0 is the tree of tree roots */ 431 /*
431 __u64 tree_id; 432 * The tree we're searching in. 1 is the tree of tree roots, 2 is the
432 433 * extent tree, etc...
433 /* keys returned will be >= min and <= max */ 434 *
434 __u64 min_objectid; 435 * A special tree_id value of 0 will cause a search in the subvolume
435 __u64 max_objectid; 436 * tree containing the inode that is passed to the ioctl.
436 437 */
437 /* keys returned will be >= min and <= max */ 438 __u64 tree_id; /* in */
438 __u64 min_offset;
439 __u64 max_offset;
440
441 /* max and min transids to search for */
442 __u64 min_transid;
443 __u64 max_transid;
444 439
445 /* keys returned will be >= min and <= max */ 440 /*
446 __u32 min_type; 441 * When doing a tree search, we're actually taking a slice from a
447 __u32 max_type; 442 * linear search space of 136-bit keys.
443 *
444 * A full 136-bit tree key is composed as:
445 * (objectid << 72) + (type << 64) + offset
446 *
447 * The individual min and max values for objectid, type and offset
448 * define the min_key and max_key values for the search range. All
449 * metadata items with a key in the interval [min_key, max_key] will be
450 * returned.
451 *
452 * Additionally, we can filter the items returned on transaction id of
453 * the metadata block they're stored in by specifying a transid range.
454 * Be aware that this transaction id only denotes when the metadata
455 * page that currently contains the item got written the last time as a
456 * result of a COW operation. The number does not have any meaning
457 * related to the transaction in which an individual item that is being
458 * returned was created or changed.
459 */
460 __u64 min_objectid; /* in */
461 __u64 max_objectid; /* in */
462 __u64 min_offset; /* in */
463 __u64 max_offset; /* in */
464 __u64 min_transid; /* in */
465 __u64 max_transid; /* in */
466 __u32 min_type; /* in */
467 __u32 max_type; /* in */
448 468
449 /* 469 /*
450 * how many items did userland ask for, and how many are we 470 * input: The maximum number of results desired.
451 * returning 471 * output: The actual number of items returned, restricted by any of:
472 * - reaching the upper bound of the search range
473 * - reaching the input nr_items limit
474 * - completely filling the supplied memory buffer
452 */ 475 */
453 __u32 nr_items; 476 __u32 nr_items; /* in/out */
454 477
455 /* align to 64 bits */ 478 /* align to 64 bits */
456 __u32 unused; 479 __u32 unused;
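As a concrete reading of the semantics documented above: a hypothetical caller that wants every EXTENT_DATA item of a single inode clamps objectid and type to single values and leaves offset and transid wide open. Sketch with error handling trimmed; the ioctl and structs are the existing uapi, the helper name is made up:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>
#include <linux/btrfs_tree.h>

/* fetch up to 'nr' file-extent items of inode 'ino'; 'fd' is any fd
 * inside the subvolume of interest (tree_id = 0, see above) */
static int search_file_extents(int fd, uint64_t ino, uint32_t nr,
			       struct btrfs_ioctl_search_args *args)
{
	struct btrfs_ioctl_search_key *sk = &args->key;

	memset(args, 0, sizeof(*args));
	sk->tree_id = 0;		/* subvolume containing 'fd' */
	sk->min_objectid = ino;		/* [min_key, max_key] clamps ... */
	sk->max_objectid = ino;		/* ... objectid to exactly 'ino' */
	sk->min_type = BTRFS_EXTENT_DATA_KEY;
	sk->max_type = BTRFS_EXTENT_DATA_KEY;
	sk->max_offset = (uint64_t)-1;	/* any file offset */
	sk->max_transid = (uint64_t)-1;	/* any metadata generation */
	sk->nr_items = nr;		/* in: limit, out: items returned */

	return ioctl(fd, BTRFS_IOC_TREE_SEARCH, args);
}

On return, sk->nr_items holds the number of items copied into args->buf, bounded by the range end, the input limit, or the buffer filling up, i.e. the three conditions listed in the nr_items comment.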