Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Kconfig                  |    3
-rw-r--r--  fs/btrfs/Makefile                 |    3
-rw-r--r--  fs/btrfs/backref.c                |   17
-rw-r--r--  fs/btrfs/backref.h                |    3
-rw-r--r--  fs/btrfs/block-rsv.c              |  425
-rw-r--r--  fs/btrfs/block-rsv.h              |  101
-rw-r--r--  fs/btrfs/btrfs_inode.h            |   22
-rw-r--r--  fs/btrfs/check-integrity.c        |   11
-rw-r--r--  fs/btrfs/compression.c            |   65
-rw-r--r--  fs/btrfs/compression.h            |    3
-rw-r--r--  fs/btrfs/ctree.h                  |  282
-rw-r--r--  fs/btrfs/delalloc-space.c         |  494
-rw-r--r--  fs/btrfs/delalloc-space.h         |   23
-rw-r--r--  fs/btrfs/delayed-ref.c            |  181
-rw-r--r--  fs/btrfs/delayed-ref.h            |   10
-rw-r--r--  fs/btrfs/dev-replace.c            |   31
-rw-r--r--  fs/btrfs/disk-io.c                |  166
-rw-r--r--  fs/btrfs/disk-io.h                |    2
-rw-r--r--  fs/btrfs/extent-tree.c            | 2503
-rw-r--r--  fs/btrfs/extent_io.c              |  149
-rw-r--r--  fs/btrfs/extent_io.h              |   10
-rw-r--r--  fs/btrfs/file-item.c              |   43
-rw-r--r--  fs/btrfs/file.c                   |   28
-rw-r--r--  fs/btrfs/free-space-cache.c       |   16
-rw-r--r--  fs/btrfs/inode-map.c              |    1
-rw-r--r--  fs/btrfs/inode.c                  |  109
-rw-r--r--  fs/btrfs/ioctl.c                  |   23
-rw-r--r--  fs/btrfs/locking.c                |   62
-rw-r--r--  fs/btrfs/ordered-data.c           |   56
-rw-r--r--  fs/btrfs/ordered-data.h           |    8
-rw-r--r--  fs/btrfs/print-tree.c             |    6
-rw-r--r--  fs/btrfs/props.c                  |    8
-rw-r--r--  fs/btrfs/qgroup.c                 |   24
-rw-r--r--  fs/btrfs/raid56.h                 |    4
-rw-r--r--  fs/btrfs/relocation.c             |    1
-rw-r--r--  fs/btrfs/root-tree.c              |   56
-rw-r--r--  fs/btrfs/scrub.c                  |   50
-rw-r--r--  fs/btrfs/send.c                   |   16
-rw-r--r--  fs/btrfs/space-info.c             | 1094
-rw-r--r--  fs/btrfs/space-info.h             |  133
-rw-r--r--  fs/btrfs/super.c                  |   30
-rw-r--r--  fs/btrfs/sysfs.c                  |    1
-rw-r--r--  fs/btrfs/tests/extent-io-tests.c  |  117
-rw-r--r--  fs/btrfs/tests/extent-map-tests.c |   22
-rw-r--r--  fs/btrfs/transaction.c            |   18
-rw-r--r--  fs/btrfs/transaction.h            |    1
-rw-r--r--  fs/btrfs/tree-checker.c           |   11
-rw-r--r--  fs/btrfs/tree-log.c               |   40
-rw-r--r--  fs/btrfs/volumes.c                |  376
-rw-r--r--  fs/btrfs/volumes.h                |   52
50 files changed, 3786 insertions(+), 3124 deletions(-)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 23537bc8c827..212b4a854f2c 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -2,7 +2,8 @@
2 2
3config BTRFS_FS 3config BTRFS_FS
4 tristate "Btrfs filesystem support" 4 tristate "Btrfs filesystem support"
5 select LIBCRC32C 5 select CRYPTO
6 select CRYPTO_CRC32C
6 select ZLIB_INFLATE 7 select ZLIB_INFLATE
7 select ZLIB_DEFLATE 8 select ZLIB_DEFLATE
8 select LZO_COMPRESS 9 select LZO_COMPRESS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ca693dd554e9..76a843198bcb 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -10,7 +10,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
10 export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ 10 export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
11 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 11 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
12 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ 12 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
13 uuid-tree.o props.o free-space-tree.o tree-checker.o 13 uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
14 block-rsv.o delalloc-space.o
14 15
15btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 16btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
16btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o 17btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 982152d3f920..89116afda7a2 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1465,12 +1465,11 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
1465 * 1465 *
1466 * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. 1466 * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
1467 */ 1467 */
1468int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) 1468int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
1469 struct ulist *roots, struct ulist *tmp)
1469{ 1470{
1470 struct btrfs_fs_info *fs_info = root->fs_info; 1471 struct btrfs_fs_info *fs_info = root->fs_info;
1471 struct btrfs_trans_handle *trans; 1472 struct btrfs_trans_handle *trans;
1472 struct ulist *tmp = NULL;
1473 struct ulist *roots = NULL;
1474 struct ulist_iterator uiter; 1473 struct ulist_iterator uiter;
1475 struct ulist_node *node; 1474 struct ulist_node *node;
1476 struct seq_list elem = SEQ_LIST_INIT(elem); 1475 struct seq_list elem = SEQ_LIST_INIT(elem);
@@ -1481,12 +1480,8 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
1481 .share_count = 0, 1480 .share_count = 0,
1482 }; 1481 };
1483 1482
1484 tmp = ulist_alloc(GFP_NOFS); 1483 ulist_init(roots);
1485 roots = ulist_alloc(GFP_NOFS); 1484 ulist_init(tmp);
1486 if (!tmp || !roots) {
1487 ret = -ENOMEM;
1488 goto out;
1489 }
1490 1485
1491 trans = btrfs_attach_transaction(root); 1486 trans = btrfs_attach_transaction(root);
1492 if (IS_ERR(trans)) { 1487 if (IS_ERR(trans)) {
@@ -1527,8 +1522,8 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
1527 up_read(&fs_info->commit_root_sem); 1522 up_read(&fs_info->commit_root_sem);
1528 } 1523 }
1529out: 1524out:
1530 ulist_free(tmp); 1525 ulist_release(roots);
1531 ulist_free(roots); 1526 ulist_release(tmp);
1532 return ret; 1527 return ret;
1533} 1528}
1534 1529
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 54d58988483a..777f61dc081e 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -57,7 +57,8 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
57 u64 start_off, struct btrfs_path *path, 57 u64 start_off, struct btrfs_path *path,
58 struct btrfs_inode_extref **ret_extref, 58 struct btrfs_inode_extref **ret_extref,
59 u64 *found_off); 59 u64 *found_off);
60int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr); 60int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
61 struct ulist *roots, struct ulist *tmp_ulist);
61 62
62int __init btrfs_prelim_ref_init(void); 63int __init btrfs_prelim_ref_init(void);
63void __cold btrfs_prelim_ref_exit(void); 64void __cold btrfs_prelim_ref_exit(void);
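Note: the btrfs_check_shared() change above moves ulist ownership to the caller. Instead of allocating and freeing a pair of ulists on every call, the function now only ulist_init()s and ulist_release()s storage that the caller provides, so the same ulists can be reused across many extents (for example while walking a file during fiemap). A minimal caller-side sketch; the variable names are illustrative and not taken from this patch:

        struct ulist roots;
        struct ulist tmp;
        int shared;

        ulist_init(&roots);
        ulist_init(&tmp);

        /*
         * Reusable across calls: btrfs_check_shared() re-inits and
         * releases the ulists itself, so no per-call allocation is
         * needed anymore.
         */
        shared = btrfs_check_shared(root, ino, bytenr, &roots, &tmp);
        if (shared < 0)
                ret = shared;           /* error */
        else if (shared)
                flags |= EXTENT_SHARED; /* extent is shared */

        ulist_release(&roots);
        ulist_release(&tmp);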
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
new file mode 100644
index 000000000000..698470b9f32d
--- /dev/null
+++ b/fs/btrfs/block-rsv.c
@@ -0,0 +1,425 @@
1// SPDX-License-Identifier: GPL-2.0
2
3#include "ctree.h"
4#include "block-rsv.h"
5#include "space-info.h"
6#include "math.h"
7#include "transaction.h"
8
9static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
10 struct btrfs_block_rsv *block_rsv,
11 struct btrfs_block_rsv *dest, u64 num_bytes,
12 u64 *qgroup_to_release_ret)
13{
14 struct btrfs_space_info *space_info = block_rsv->space_info;
15 u64 qgroup_to_release = 0;
16 u64 ret;
17
18 spin_lock(&block_rsv->lock);
19 if (num_bytes == (u64)-1) {
20 num_bytes = block_rsv->size;
21 qgroup_to_release = block_rsv->qgroup_rsv_size;
22 }
23 block_rsv->size -= num_bytes;
24 if (block_rsv->reserved >= block_rsv->size) {
25 num_bytes = block_rsv->reserved - block_rsv->size;
26 block_rsv->reserved = block_rsv->size;
27 block_rsv->full = 1;
28 } else {
29 num_bytes = 0;
30 }
31 if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
32 qgroup_to_release = block_rsv->qgroup_rsv_reserved -
33 block_rsv->qgroup_rsv_size;
34 block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
35 } else {
36 qgroup_to_release = 0;
37 }
38 spin_unlock(&block_rsv->lock);
39
40 ret = num_bytes;
41 if (num_bytes > 0) {
42 if (dest) {
43 spin_lock(&dest->lock);
44 if (!dest->full) {
45 u64 bytes_to_add;
46
47 bytes_to_add = dest->size - dest->reserved;
48 bytes_to_add = min(num_bytes, bytes_to_add);
49 dest->reserved += bytes_to_add;
50 if (dest->reserved >= dest->size)
51 dest->full = 1;
52 num_bytes -= bytes_to_add;
53 }
54 spin_unlock(&dest->lock);
55 }
56 if (num_bytes)
57 btrfs_space_info_add_old_bytes(fs_info, space_info,
58 num_bytes);
59 }
60 if (qgroup_to_release_ret)
61 *qgroup_to_release_ret = qgroup_to_release;
62 return ret;
63}
64
65int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
66 struct btrfs_block_rsv *dst, u64 num_bytes,
67 bool update_size)
68{
69 int ret;
70
71 ret = btrfs_block_rsv_use_bytes(src, num_bytes);
72 if (ret)
73 return ret;
74
75 btrfs_block_rsv_add_bytes(dst, num_bytes, update_size);
76 return 0;
77}
78
79void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
80{
81 memset(rsv, 0, sizeof(*rsv));
82 spin_lock_init(&rsv->lock);
83 rsv->type = type;
84}
85
86void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
87 struct btrfs_block_rsv *rsv,
88 unsigned short type)
89{
90 btrfs_init_block_rsv(rsv, type);
91 rsv->space_info = btrfs_find_space_info(fs_info,
92 BTRFS_BLOCK_GROUP_METADATA);
93}
94
95struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
96 unsigned short type)
97{
98 struct btrfs_block_rsv *block_rsv;
99
100 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
101 if (!block_rsv)
102 return NULL;
103
104 btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
105 return block_rsv;
106}
107
108void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
109 struct btrfs_block_rsv *rsv)
110{
111 if (!rsv)
112 return;
113 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
114 kfree(rsv);
115}
116
117int btrfs_block_rsv_add(struct btrfs_root *root,
118 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
119 enum btrfs_reserve_flush_enum flush)
120{
121 int ret;
122
123 if (num_bytes == 0)
124 return 0;
125
126 ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
127 if (!ret)
128 btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
129
130 return ret;
131}
132
133int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
134{
135 u64 num_bytes = 0;
136 int ret = -ENOSPC;
137
138 if (!block_rsv)
139 return 0;
140
141 spin_lock(&block_rsv->lock);
142 num_bytes = div_factor(block_rsv->size, min_factor);
143 if (block_rsv->reserved >= num_bytes)
144 ret = 0;
145 spin_unlock(&block_rsv->lock);
146
147 return ret;
148}
149
150int btrfs_block_rsv_refill(struct btrfs_root *root,
151 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
152 enum btrfs_reserve_flush_enum flush)
153{
154 u64 num_bytes = 0;
155 int ret = -ENOSPC;
156
157 if (!block_rsv)
158 return 0;
159
160 spin_lock(&block_rsv->lock);
161 num_bytes = min_reserved;
162 if (block_rsv->reserved >= num_bytes)
163 ret = 0;
164 else
165 num_bytes -= block_rsv->reserved;
166 spin_unlock(&block_rsv->lock);
167
168 if (!ret)
169 return 0;
170
171 ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
172 if (!ret) {
173 btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
174 return 0;
175 }
176
177 return ret;
178}
179
180u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
181 struct btrfs_block_rsv *block_rsv,
182 u64 num_bytes, u64 *qgroup_to_release)
183{
184 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
185 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
186 struct btrfs_block_rsv *target = NULL;
187
188 /*
189 * If we are the delayed_rsv then push to the global rsv, otherwise dump
190 * into the delayed rsv if it is not full.
191 */
192 if (block_rsv == delayed_rsv)
193 target = global_rsv;
194 else if (block_rsv != global_rsv && !delayed_rsv->full)
195 target = delayed_rsv;
196
197 if (target && block_rsv->space_info != target->space_info)
198 target = NULL;
199
200 return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
201 qgroup_to_release);
202}
203
204int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
205{
206 int ret = -ENOSPC;
207
208 spin_lock(&block_rsv->lock);
209 if (block_rsv->reserved >= num_bytes) {
210 block_rsv->reserved -= num_bytes;
211 if (block_rsv->reserved < block_rsv->size)
212 block_rsv->full = 0;
213 ret = 0;
214 }
215 spin_unlock(&block_rsv->lock);
216 return ret;
217}
218
219void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
220 u64 num_bytes, bool update_size)
221{
222 spin_lock(&block_rsv->lock);
223 block_rsv->reserved += num_bytes;
224 if (update_size)
225 block_rsv->size += num_bytes;
226 else if (block_rsv->reserved >= block_rsv->size)
227 block_rsv->full = 1;
228 spin_unlock(&block_rsv->lock);
229}
230
231int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
232 struct btrfs_block_rsv *dest, u64 num_bytes,
233 int min_factor)
234{
235 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
236 u64 min_bytes;
237
238 if (global_rsv->space_info != dest->space_info)
239 return -ENOSPC;
240
241 spin_lock(&global_rsv->lock);
242 min_bytes = div_factor(global_rsv->size, min_factor);
243 if (global_rsv->reserved < min_bytes + num_bytes) {
244 spin_unlock(&global_rsv->lock);
245 return -ENOSPC;
246 }
247 global_rsv->reserved -= num_bytes;
248 if (global_rsv->reserved < global_rsv->size)
249 global_rsv->full = 0;
250 spin_unlock(&global_rsv->lock);
251
252 btrfs_block_rsv_add_bytes(dest, num_bytes, true);
253 return 0;
254}
255
256void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
257{
258 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
259 struct btrfs_space_info *sinfo = block_rsv->space_info;
260 u64 num_bytes;
261
262 /*
263 * The global block rsv is based on the size of the extent tree, the
264 * checksum tree and the root tree. If the fs is empty we want to set
265 * it to a minimal amount for safety.
266 */
267 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
268 btrfs_root_used(&fs_info->csum_root->root_item) +
269 btrfs_root_used(&fs_info->tree_root->root_item);
270 num_bytes = max_t(u64, num_bytes, SZ_16M);
271
272 spin_lock(&sinfo->lock);
273 spin_lock(&block_rsv->lock);
274
275 block_rsv->size = min_t(u64, num_bytes, SZ_512M);
276
277 if (block_rsv->reserved < block_rsv->size) {
278 num_bytes = btrfs_space_info_used(sinfo, true);
279 if (sinfo->total_bytes > num_bytes) {
280 num_bytes = sinfo->total_bytes - num_bytes;
281 num_bytes = min(num_bytes,
282 block_rsv->size - block_rsv->reserved);
283 block_rsv->reserved += num_bytes;
284 btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
285 num_bytes);
286 trace_btrfs_space_reservation(fs_info, "space_info",
287 sinfo->flags, num_bytes,
288 1);
289 }
290 } else if (block_rsv->reserved > block_rsv->size) {
291 num_bytes = block_rsv->reserved - block_rsv->size;
292 btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
293 -num_bytes);
294 trace_btrfs_space_reservation(fs_info, "space_info",
295 sinfo->flags, num_bytes, 0);
296 block_rsv->reserved = block_rsv->size;
297 }
298
299 if (block_rsv->reserved == block_rsv->size)
300 block_rsv->full = 1;
301 else
302 block_rsv->full = 0;
303
304 spin_unlock(&block_rsv->lock);
305 spin_unlock(&sinfo->lock);
306}
307
308void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
309{
310 struct btrfs_space_info *space_info;
311
312 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
313 fs_info->chunk_block_rsv.space_info = space_info;
314
315 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
316 fs_info->global_block_rsv.space_info = space_info;
317 fs_info->trans_block_rsv.space_info = space_info;
318 fs_info->empty_block_rsv.space_info = space_info;
319 fs_info->delayed_block_rsv.space_info = space_info;
320 fs_info->delayed_refs_rsv.space_info = space_info;
321
322 fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
323 fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
324 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
325 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
326 if (fs_info->quota_root)
327 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
328 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
329
330 btrfs_update_global_block_rsv(fs_info);
331}
332
333void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
334{
335 btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1);
336 WARN_ON(fs_info->trans_block_rsv.size > 0);
337 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
338 WARN_ON(fs_info->chunk_block_rsv.size > 0);
339 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
340 WARN_ON(fs_info->delayed_block_rsv.size > 0);
341 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
342 WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
343 WARN_ON(fs_info->delayed_refs_rsv.size > 0);
344}
345
346static struct btrfs_block_rsv *get_block_rsv(
347 const struct btrfs_trans_handle *trans,
348 const struct btrfs_root *root)
349{
350 struct btrfs_fs_info *fs_info = root->fs_info;
351 struct btrfs_block_rsv *block_rsv = NULL;
352
353 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
354 (root == fs_info->csum_root && trans->adding_csums) ||
355 (root == fs_info->uuid_root))
356 block_rsv = trans->block_rsv;
357
358 if (!block_rsv)
359 block_rsv = root->block_rsv;
360
361 if (!block_rsv)
362 block_rsv = &fs_info->empty_block_rsv;
363
364 return block_rsv;
365}
366
367struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
368 struct btrfs_root *root,
369 u32 blocksize)
370{
371 struct btrfs_fs_info *fs_info = root->fs_info;
372 struct btrfs_block_rsv *block_rsv;
373 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
374 int ret;
375 bool global_updated = false;
376
377 block_rsv = get_block_rsv(trans, root);
378
379 if (unlikely(block_rsv->size == 0))
380 goto try_reserve;
381again:
382 ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
383 if (!ret)
384 return block_rsv;
385
386 if (block_rsv->failfast)
387 return ERR_PTR(ret);
388
389 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
390 global_updated = true;
391 btrfs_update_global_block_rsv(fs_info);
392 goto again;
393 }
394
395 /*
396 * The global reserve still exists to save us from ourselves, so don't
397 * warn_on if we are short on our delayed refs reserve.
398 */
399 if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
400 btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
401 static DEFINE_RATELIMIT_STATE(_rs,
402 DEFAULT_RATELIMIT_INTERVAL * 10,
403 /*DEFAULT_RATELIMIT_BURST*/ 1);
404 if (__ratelimit(&_rs))
405 WARN(1, KERN_DEBUG
406 "BTRFS: block rsv returned %d\n", ret);
407 }
408try_reserve:
409 ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
410 BTRFS_RESERVE_NO_FLUSH);
411 if (!ret)
412 return block_rsv;
413 /*
414 * If we couldn't reserve metadata bytes try and use some from
415 * the global reserve if its space type is the same as the global
416 * reservation.
417 */
418 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
419 block_rsv->space_info == global_rsv->space_info) {
420 ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize);
421 if (!ret)
422 return global_rsv;
423 }
424 return ERR_PTR(ret);
425}
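Note: block-rsv.c is a new file, but the code is moved out of extent-tree.c rather than written from scratch; it collects the block reserve helpers behind one interface. Pieced together from the functions above, the typical life cycle of a temporary reserve looks roughly like the sketch below. This is a hedged illustration only: the byte counts and flush mode are made up and error handling is trimmed.

        struct btrfs_block_rsv *rsv;
        int ret;

        rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
        if (!rsv)
                return -ENOMEM;

        /* Reserve metadata space; may flush depending on the enum passed. */
        ret = btrfs_block_rsv_add(root, rsv, SZ_1M, BTRFS_RESERVE_FLUSH_ALL);
        if (ret)
                goto out;

        /* Consumers then take bytes out of the reservation. */
        ret = btrfs_block_rsv_use_bytes(rsv, SZ_64K);

out:
        /* Releases whatever is still reserved ((u64)-1) and frees the struct. */
        btrfs_free_block_rsv(fs_info, rsv);
        return ret;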
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
new file mode 100644
index 000000000000..d1428bb73fc5
--- /dev/null
+++ b/fs/btrfs/block-rsv.h
@@ -0,0 +1,101 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2
3#ifndef BTRFS_BLOCK_RSV_H
4#define BTRFS_BLOCK_RSV_H
5
6struct btrfs_trans_handle;
7enum btrfs_reserve_flush_enum;
8
9/*
10 * Types of block reserves
11 */
12enum {
13 BTRFS_BLOCK_RSV_GLOBAL,
14 BTRFS_BLOCK_RSV_DELALLOC,
15 BTRFS_BLOCK_RSV_TRANS,
16 BTRFS_BLOCK_RSV_CHUNK,
17 BTRFS_BLOCK_RSV_DELOPS,
18 BTRFS_BLOCK_RSV_DELREFS,
19 BTRFS_BLOCK_RSV_EMPTY,
20 BTRFS_BLOCK_RSV_TEMP,
21};
22
23struct btrfs_block_rsv {
24 u64 size;
25 u64 reserved;
26 struct btrfs_space_info *space_info;
27 spinlock_t lock;
28 unsigned short full;
29 unsigned short type;
30 unsigned short failfast;
31
32 /*
33 * Qgroup equivalent for @size @reserved
34 *
35 * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care
36 * about things like csum size nor how many tree blocks it will need to
37 * reserve.
38 *
39 * Qgroup cares more about net change of the extent usage.
40 *
41 * So for one newly inserted file extent, in worst case it will cause
42 * leaf split and level increase, nodesize for each file extent is
43 * already too much.
44 *
45 * In short, qgroup_size/reserved is the upper limit of possible needed
46 * qgroup metadata reservation.
47 */
48 u64 qgroup_rsv_size;
49 u64 qgroup_rsv_reserved;
50};
51
52void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
53struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
54 unsigned short type);
55void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
56 struct btrfs_block_rsv *rsv,
57 unsigned short type);
58void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
59 struct btrfs_block_rsv *rsv);
60int btrfs_block_rsv_add(struct btrfs_root *root,
61 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
62 enum btrfs_reserve_flush_enum flush);
63int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
64int btrfs_block_rsv_refill(struct btrfs_root *root,
65 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
66 enum btrfs_reserve_flush_enum flush);
67int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
68 struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
69 bool update_size);
70int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes);
71int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
72 struct btrfs_block_rsv *dest, u64 num_bytes,
73 int min_factor);
74void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
75 u64 num_bytes, bool update_size);
76u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
77 struct btrfs_block_rsv *block_rsv,
78 u64 num_bytes, u64 *qgroup_to_release);
79void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info);
80void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info);
81void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info);
82struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
83 struct btrfs_root *root,
84 u32 blocksize);
85
86static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
87 struct btrfs_block_rsv *block_rsv,
88 u64 num_bytes)
89{
90 __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
91}
92
93static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
94 struct btrfs_block_rsv *block_rsv,
95 u32 blocksize)
96{
97 btrfs_block_rsv_add_bytes(block_rsv, blocksize, false);
98 btrfs_block_rsv_release(fs_info, block_rsv, 0);
99}
100
101#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d5b438706b77..f853835c409c 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -337,22 +337,34 @@ static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
337 clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); 337 clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
338} 338}
339 339
340/* Array of bytes with variable length, hexadecimal format 0x1234 */
341#define CSUM_FMT "0x%*phN"
342#define CSUM_FMT_VALUE(size, bytes) size, bytes
343
340static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, 344static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
341 u64 logical_start, u32 csum, u32 csum_expected, int mirror_num) 345 u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
342{ 346{
343 struct btrfs_root *root = inode->root; 347 struct btrfs_root *root = inode->root;
348 struct btrfs_super_block *sb = root->fs_info->super_copy;
349 const u16 csum_size = btrfs_super_csum_size(sb);
344 350
345 /* Output minus objectid, which is more meaningful */ 351 /* Output minus objectid, which is more meaningful */
346 if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) 352 if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
347 btrfs_warn_rl(root->fs_info, 353 btrfs_warn_rl(root->fs_info,
348 "csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d", 354"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
349 root->root_key.objectid, btrfs_ino(inode), 355 root->root_key.objectid, btrfs_ino(inode),
350 logical_start, csum, csum_expected, mirror_num); 356 logical_start,
357 CSUM_FMT_VALUE(csum_size, csum),
358 CSUM_FMT_VALUE(csum_size, csum_expected),
359 mirror_num);
351 else 360 else
352 btrfs_warn_rl(root->fs_info, 361 btrfs_warn_rl(root->fs_info,
353 "csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d", 362"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
354 root->root_key.objectid, btrfs_ino(inode), 363 root->root_key.objectid, btrfs_ino(inode),
355 logical_start, csum, csum_expected, mirror_num); 364 logical_start,
365 CSUM_FMT_VALUE(csum_size, csum),
366 CSUM_FMT_VALUE(csum_size, csum_expected),
367 mirror_num);
356} 368}
357 369
358#endif 370#endif
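Note: the CSUM_FMT/CSUM_FMT_VALUE pair switches the csum error message from a fixed "0x%08x" u32 to the %*phN specifier, which prints a variable-length byte array in hex sized by the superblock's csum size. A hypothetical use, only to show how the format expands (the bytes are invented):

        u8 csum[BTRFS_CSUM_SIZE] = { 0x12, 0x34, 0x56, 0x78 };

        pr_info("csum " CSUM_FMT "\n", CSUM_FMT_VALUE(4, csum));
        /*
         * Expands to pr_info("csum 0x%*phN\n", 4, csum)
         * and prints: csum 0x12345678 for a 4-byte crc32c checksum.
         */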
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index b0c8094528d1..81a9731959a9 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -83,7 +83,7 @@
83#include <linux/blkdev.h> 83#include <linux/blkdev.h>
84#include <linux/mm.h> 84#include <linux/mm.h>
85#include <linux/string.h> 85#include <linux/string.h>
86#include <linux/crc32c.h> 86#include <crypto/hash.h>
87#include "ctree.h" 87#include "ctree.h"
88#include "disk-io.h" 88#include "disk-io.h"
89#include "transaction.h" 89#include "transaction.h"
@@ -1710,9 +1710,9 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
1710 char **datav, unsigned int num_pages) 1710 char **datav, unsigned int num_pages)
1711{ 1711{
1712 struct btrfs_fs_info *fs_info = state->fs_info; 1712 struct btrfs_fs_info *fs_info = state->fs_info;
1713 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1713 struct btrfs_header *h; 1714 struct btrfs_header *h;
1714 u8 csum[BTRFS_CSUM_SIZE]; 1715 u8 csum[BTRFS_CSUM_SIZE];
1715 u32 crc = ~(u32)0;
1716 unsigned int i; 1716 unsigned int i;
1717 1717
1718 if (num_pages * PAGE_SIZE < state->metablock_size) 1718 if (num_pages * PAGE_SIZE < state->metablock_size)
@@ -1723,14 +1723,17 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
1723 if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) 1723 if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE))
1724 return 1; 1724 return 1;
1725 1725
1726 shash->tfm = fs_info->csum_shash;
1727 crypto_shash_init(shash);
1728
1726 for (i = 0; i < num_pages; i++) { 1729 for (i = 0; i < num_pages; i++) {
1727 u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); 1730 u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
1728 size_t sublen = i ? PAGE_SIZE : 1731 size_t sublen = i ? PAGE_SIZE :
1729 (PAGE_SIZE - BTRFS_CSUM_SIZE); 1732 (PAGE_SIZE - BTRFS_CSUM_SIZE);
1730 1733
1731 crc = crc32c(crc, data, sublen); 1734 crypto_shash_update(shash, data, sublen);
1732 } 1735 }
1733 btrfs_csum_final(crc, csum); 1736 crypto_shash_final(shash, csum);
1734 if (memcmp(csum, h->csum, state->csum_size)) 1737 if (memcmp(csum, h->csum, state->csum_size))
1735 return 1; 1738 return 1;
1736 1739
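Note: the conversion above replaces the open-coded crc32c()/btrfs_csum_final() pair with the generic crypto shash API, using a transform that (per the disk-io.c changes elsewhere in this series) is allocated once at mount and cached in fs_info->csum_shash. The recurring pattern, reduced to its bones and assuming the tfm is already set up:

        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
        u8 result[BTRFS_CSUM_SIZE];

        shash->tfm = fs_info->csum_shash;
        crypto_shash_init(shash);
        crypto_shash_update(shash, data, len);  /* may be called repeatedly */
        crypto_shash_final(shash, result);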
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 84dd4a8980c5..60c47b417a4b 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -17,6 +17,7 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/sched/mm.h> 18#include <linux/sched/mm.h>
19#include <linux/log2.h> 19#include <linux/log2.h>
20#include <crypto/hash.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "disk-io.h" 22#include "disk-io.h"
22#include "transaction.h" 23#include "transaction.h"
@@ -42,6 +43,22 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type)
42 return NULL; 43 return NULL;
43} 44}
44 45
46bool btrfs_compress_is_valid_type(const char *str, size_t len)
47{
48 int i;
49
50 for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
51 size_t comp_len = strlen(btrfs_compress_types[i]);
52
53 if (len < comp_len)
54 continue;
55
56 if (!strncmp(btrfs_compress_types[i], str, comp_len))
57 return true;
58 }
59 return false;
60}
61
45static int btrfs_decompress_bio(struct compressed_bio *cb); 62static int btrfs_decompress_bio(struct compressed_bio *cb);
46 63
47static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, 64static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
@@ -57,32 +74,37 @@ static int check_compressed_csum(struct btrfs_inode *inode,
57 struct compressed_bio *cb, 74 struct compressed_bio *cb,
58 u64 disk_start) 75 u64 disk_start)
59{ 76{
77 struct btrfs_fs_info *fs_info = inode->root->fs_info;
78 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
79 const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
60 int ret; 80 int ret;
61 struct page *page; 81 struct page *page;
62 unsigned long i; 82 unsigned long i;
63 char *kaddr; 83 char *kaddr;
64 u32 csum; 84 u8 csum[BTRFS_CSUM_SIZE];
65 u32 *cb_sum = &cb->sums; 85 u8 *cb_sum = cb->sums;
66 86
67 if (inode->flags & BTRFS_INODE_NODATASUM) 87 if (inode->flags & BTRFS_INODE_NODATASUM)
68 return 0; 88 return 0;
69 89
90 shash->tfm = fs_info->csum_shash;
91
70 for (i = 0; i < cb->nr_pages; i++) { 92 for (i = 0; i < cb->nr_pages; i++) {
71 page = cb->compressed_pages[i]; 93 page = cb->compressed_pages[i];
72 csum = ~(u32)0;
73 94
95 crypto_shash_init(shash);
74 kaddr = kmap_atomic(page); 96 kaddr = kmap_atomic(page);
75 csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE); 97 crypto_shash_update(shash, kaddr, PAGE_SIZE);
76 btrfs_csum_final(csum, (u8 *)&csum);
77 kunmap_atomic(kaddr); 98 kunmap_atomic(kaddr);
99 crypto_shash_final(shash, (u8 *)&csum);
78 100
79 if (csum != *cb_sum) { 101 if (memcmp(&csum, cb_sum, csum_size)) {
80 btrfs_print_data_csum_error(inode, disk_start, csum, 102 btrfs_print_data_csum_error(inode, disk_start,
81 *cb_sum, cb->mirror_num); 103 csum, cb_sum, cb->mirror_num);
82 ret = -EIO; 104 ret = -EIO;
83 goto fail; 105 goto fail;
84 } 106 }
85 cb_sum++; 107 cb_sum += csum_size;
86 108
87 } 109 }
88 ret = 0; 110 ret = 0;
@@ -318,7 +340,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
318 340
319 bdev = fs_info->fs_devices->latest_bdev; 341 bdev = fs_info->fs_devices->latest_bdev;
320 342
321 bio = btrfs_bio_alloc(bdev, first_byte); 343 bio = btrfs_bio_alloc(first_byte);
344 bio_set_dev(bio, bdev);
322 bio->bi_opf = REQ_OP_WRITE | write_flags; 345 bio->bi_opf = REQ_OP_WRITE | write_flags;
323 bio->bi_private = cb; 346 bio->bi_private = cb;
324 bio->bi_end_io = end_compressed_bio_write; 347 bio->bi_end_io = end_compressed_bio_write;
@@ -360,7 +383,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
360 bio_endio(bio); 383 bio_endio(bio);
361 } 384 }
362 385
363 bio = btrfs_bio_alloc(bdev, first_byte); 386 bio = btrfs_bio_alloc(first_byte);
387 bio_set_dev(bio, bdev);
364 bio->bi_opf = REQ_OP_WRITE | write_flags; 388 bio->bi_opf = REQ_OP_WRITE | write_flags;
365 bio->bi_private = cb; 389 bio->bi_private = cb;
366 bio->bi_end_io = end_compressed_bio_write; 390 bio->bi_end_io = end_compressed_bio_write;
@@ -536,7 +560,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
536 struct extent_map *em; 560 struct extent_map *em;
537 blk_status_t ret = BLK_STS_RESOURCE; 561 blk_status_t ret = BLK_STS_RESOURCE;
538 int faili = 0; 562 int faili = 0;
539 u32 *sums; 563 const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
564 u8 *sums;
540 565
541 em_tree = &BTRFS_I(inode)->extent_tree; 566 em_tree = &BTRFS_I(inode)->extent_tree;
542 567
@@ -558,7 +583,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
558 cb->errors = 0; 583 cb->errors = 0;
559 cb->inode = inode; 584 cb->inode = inode;
560 cb->mirror_num = mirror_num; 585 cb->mirror_num = mirror_num;
561 sums = &cb->sums; 586 sums = cb->sums;
562 587
563 cb->start = em->orig_start; 588 cb->start = em->orig_start;
564 em_len = em->len; 589 em_len = em->len;
@@ -597,7 +622,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
597 /* include any pages we added in add_ra-bio_pages */ 622 /* include any pages we added in add_ra-bio_pages */
598 cb->len = bio->bi_iter.bi_size; 623 cb->len = bio->bi_iter.bi_size;
599 624
600 comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); 625 comp_bio = btrfs_bio_alloc(cur_disk_byte);
626 bio_set_dev(comp_bio, bdev);
601 comp_bio->bi_opf = REQ_OP_READ; 627 comp_bio->bi_opf = REQ_OP_READ;
602 comp_bio->bi_private = cb; 628 comp_bio->bi_private = cb;
603 comp_bio->bi_end_io = end_compressed_bio_read; 629 comp_bio->bi_end_io = end_compressed_bio_read;
@@ -617,6 +643,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
617 page->mapping = NULL; 643 page->mapping = NULL;
618 if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < 644 if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
619 PAGE_SIZE) { 645 PAGE_SIZE) {
646 unsigned int nr_sectors;
647
620 ret = btrfs_bio_wq_end_io(fs_info, comp_bio, 648 ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
621 BTRFS_WQ_ENDIO_DATA); 649 BTRFS_WQ_ENDIO_DATA);
622 BUG_ON(ret); /* -ENOMEM */ 650 BUG_ON(ret); /* -ENOMEM */
@@ -634,8 +662,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
634 sums); 662 sums);
635 BUG_ON(ret); /* -ENOMEM */ 663 BUG_ON(ret); /* -ENOMEM */
636 } 664 }
637 sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size, 665
638 fs_info->sectorsize); 666 nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
667 fs_info->sectorsize);
668 sums += csum_size * nr_sectors;
639 669
640 ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); 670 ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
641 if (ret) { 671 if (ret) {
@@ -643,7 +673,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
643 bio_endio(comp_bio); 673 bio_endio(comp_bio);
644 } 674 }
645 675
646 comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); 676 comp_bio = btrfs_bio_alloc(cur_disk_byte);
677 bio_set_dev(comp_bio, bdev);
647 comp_bio->bi_opf = REQ_OP_READ; 678 comp_bio->bi_opf = REQ_OP_READ;
648 comp_bio->bi_private = cb; 679 comp_bio->bi_private = cb;
649 comp_bio->bi_end_io = end_compressed_bio_read; 680 comp_bio->bi_end_io = end_compressed_bio_read;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 9976fe0f7526..2035b8eb1290 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -61,7 +61,7 @@ struct compressed_bio {
61 * the start of a variable length array of checksums only 61 * the start of a variable length array of checksums only
62 * used by reads 62 * used by reads
63 */ 63 */
64 u32 sums; 64 u8 sums[];
65}; 65};
66 66
67static inline unsigned int btrfs_compress_type(unsigned int type_level) 67static inline unsigned int btrfs_compress_type(unsigned int type_level)
@@ -173,6 +173,7 @@ extern const struct btrfs_compress_op btrfs_lzo_compress;
173extern const struct btrfs_compress_op btrfs_zstd_compress; 173extern const struct btrfs_compress_op btrfs_zstd_compress;
174 174
175const char* btrfs_compress_type2str(enum btrfs_compression_type type); 175const char* btrfs_compress_type2str(enum btrfs_compression_type type);
176bool btrfs_compress_is_valid_type(const char *str, size_t len);
176 177
177int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); 178int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
178 179
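Note: btrfs_compress_is_valid_type() gives the property code a single place to validate a compression name. It compares only the leading strlen(type) bytes, so a value carrying a trailing level suffix still matches. A few hedged example calls; the callers are hypothetical and the expected results follow from the loop above, assuming the type table lists zlib/lzo/zstd as in mainline:

        btrfs_compress_is_valid_type("zlib", 4);        /* true */
        btrfs_compress_is_valid_type("zstd", 4);        /* true */
        btrfs_compress_is_valid_type("zlib:9", 6);      /* true, prefix match */
        btrfs_compress_is_valid_type("gzip", 4);        /* false */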
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0a61dff27f57..299e11e6c554 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -19,6 +19,7 @@
19#include <linux/kobject.h> 19#include <linux/kobject.h>
20#include <trace/events/btrfs.h> 20#include <trace/events/btrfs.h>
21#include <asm/kmap_types.h> 21#include <asm/kmap_types.h>
22#include <asm/unaligned.h>
22#include <linux/pagemap.h> 23#include <linux/pagemap.h>
23#include <linux/btrfs.h> 24#include <linux/btrfs.h>
24#include <linux/btrfs_tree.h> 25#include <linux/btrfs_tree.h>
@@ -31,11 +32,13 @@
31#include "extent_io.h" 32#include "extent_io.h"
32#include "extent_map.h" 33#include "extent_map.h"
33#include "async-thread.h" 34#include "async-thread.h"
35#include "block-rsv.h"
34 36
35struct btrfs_trans_handle; 37struct btrfs_trans_handle;
36struct btrfs_transaction; 38struct btrfs_transaction;
37struct btrfs_pending_snapshot; 39struct btrfs_pending_snapshot;
38struct btrfs_delayed_ref_root; 40struct btrfs_delayed_ref_root;
41struct btrfs_space_info;
39extern struct kmem_cache *btrfs_trans_handle_cachep; 42extern struct kmem_cache *btrfs_trans_handle_cachep;
40extern struct kmem_cache *btrfs_bit_radix_cachep; 43extern struct kmem_cache *btrfs_bit_radix_cachep;
41extern struct kmem_cache *btrfs_path_cachep; 44extern struct kmem_cache *btrfs_path_cachep;
@@ -45,7 +48,16 @@ struct btrfs_ref;
45 48
46#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ 49#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
47 50
48#define BTRFS_MAX_MIRRORS 3 51/*
52 * Maximum number of mirrors that can be available for all profiles counting
53 * the target device of dev-replace as one. During an active device replace
54 * procedure, the target device of the copy operation is a mirror for the
55 * filesystem data as well that can be used to read data in order to repair
56 * read errors on other disks.
57 *
58 * Current value is derived from RAID1 with 2 copies.
59 */
60#define BTRFS_MAX_MIRRORS (2 + 1)
49 61
50#define BTRFS_MAX_LEVEL 8 62#define BTRFS_MAX_LEVEL 8
51 63
@@ -72,6 +84,7 @@ struct btrfs_ref;
72 84
73/* four bytes for CRC32 */ 85/* four bytes for CRC32 */
74static const int btrfs_csum_sizes[] = { 4 }; 86static const int btrfs_csum_sizes[] = { 4 };
87static const char *btrfs_csum_names[] = { "crc32c" };
75 88
76#define BTRFS_EMPTY_DIR_SIZE 0 89#define BTRFS_EMPTY_DIR_SIZE 0
77 90
@@ -99,10 +112,6 @@ static inline u32 count_max_extents(u64 size)
99 return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); 112 return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
100} 113}
101 114
102struct btrfs_mapping_tree {
103 struct extent_map_tree map_tree;
104};
105
106static inline unsigned long btrfs_chunk_item_size(int num_stripes) 115static inline unsigned long btrfs_chunk_item_size(int num_stripes)
107{ 116{
108 BUG_ON(num_stripes == 0); 117 BUG_ON(num_stripes == 0);
@@ -395,115 +404,6 @@ struct raid_kobject {
395 struct list_head list; 404 struct list_head list;
396}; 405};
397 406
398struct btrfs_space_info {
399 spinlock_t lock;
400
401 u64 total_bytes; /* total bytes in the space,
402 this doesn't take mirrors into account */
403 u64 bytes_used; /* total bytes used,
404 this doesn't take mirrors into account */
405 u64 bytes_pinned; /* total bytes pinned, will be freed when the
406 transaction finishes */
407 u64 bytes_reserved; /* total bytes the allocator has reserved for
408 current allocations */
409 u64 bytes_may_use; /* number of bytes that may be used for
410 delalloc/allocations */
411 u64 bytes_readonly; /* total bytes that are read only */
412
413 u64 max_extent_size; /* This will hold the maximum extent size of
414 the space info if we had an ENOSPC in the
415 allocator. */
416
417 unsigned int full:1; /* indicates that we cannot allocate any more
418 chunks for this space */
419 unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
420
421 unsigned int flush:1; /* set if we are trying to make space */
422
423 unsigned int force_alloc; /* set if we need to force a chunk
424 alloc for this space */
425
426 u64 disk_used; /* total bytes used on disk */
427 u64 disk_total; /* total bytes on disk, takes mirrors into
428 account */
429
430 u64 flags;
431
432 /*
433 * bytes_pinned is kept in line with what is actually pinned, as in
434 * we've called update_block_group and dropped the bytes_used counter
435 * and increased the bytes_pinned counter. However this means that
436 * bytes_pinned does not reflect the bytes that will be pinned once the
437 * delayed refs are flushed, so this counter is inc'ed every time we
438 * call btrfs_free_extent so it is a realtime count of what will be
439 * freed once the transaction is committed. It will be zeroed every
440 * time the transaction commits.
441 */
442 struct percpu_counter total_bytes_pinned;
443
444 struct list_head list;
445 /* Protected by the spinlock 'lock'. */
446 struct list_head ro_bgs;
447 struct list_head priority_tickets;
448 struct list_head tickets;
449 /*
450 * tickets_id just indicates the next ticket will be handled, so note
451 * it's not stored per ticket.
452 */
453 u64 tickets_id;
454
455 struct rw_semaphore groups_sem;
456 /* for block groups in our same type */
457 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
458 wait_queue_head_t wait;
459
460 struct kobject kobj;
461 struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
462};
463
464/*
465 * Types of block reserves
466 */
467enum {
468 BTRFS_BLOCK_RSV_GLOBAL,
469 BTRFS_BLOCK_RSV_DELALLOC,
470 BTRFS_BLOCK_RSV_TRANS,
471 BTRFS_BLOCK_RSV_CHUNK,
472 BTRFS_BLOCK_RSV_DELOPS,
473 BTRFS_BLOCK_RSV_DELREFS,
474 BTRFS_BLOCK_RSV_EMPTY,
475 BTRFS_BLOCK_RSV_TEMP,
476};
477
478struct btrfs_block_rsv {
479 u64 size;
480 u64 reserved;
481 struct btrfs_space_info *space_info;
482 spinlock_t lock;
483 unsigned short full;
484 unsigned short type;
485 unsigned short failfast;
486
487 /*
488 * Qgroup equivalent for @size @reserved
489 *
490 * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care
491 * about things like csum size nor how many tree blocks it will need to
492 * reserve.
493 *
494 * Qgroup cares more about net change of the extent usage.
495 *
496 * So for one newly inserted file extent, in worst case it will cause
497 * leaf split and level increase, nodesize for each file extent is
498 * already too much.
499 *
500 * In short, qgroup_size/reserved is the upper limit of possible needed
501 * qgroup metadata reservation.
502 */
503 u64 qgroup_rsv_size;
504 u64 qgroup_rsv_reserved;
505};
506
507/* 407/*
508 * free clusters are used to claim free space in relatively large chunks, 408 * free clusters are used to claim free space in relatively large chunks,
509 * allowing us to do less seeky writes. They are used for all metadata 409 * allowing us to do less seeky writes. They are used for all metadata
@@ -786,11 +686,18 @@ enum {
786 /* 686 /*
787 * Indicate that balance has been set up from the ioctl and is in the 687 * Indicate that balance has been set up from the ioctl and is in the
788 * main phase. The fs_info::balance_ctl is initialized. 688 * main phase. The fs_info::balance_ctl is initialized.
689 * Set and cleared while holding fs_info::balance_mutex.
789 */ 690 */
790 BTRFS_FS_BALANCE_RUNNING, 691 BTRFS_FS_BALANCE_RUNNING,
791 692
792 /* Indicate that the cleaner thread is awake and doing something. */ 693 /* Indicate that the cleaner thread is awake and doing something. */
793 BTRFS_FS_CLEANER_RUNNING, 694 BTRFS_FS_CLEANER_RUNNING,
695
696 /*
697 * The checksumming has an optimized version and is considered fast,
698 * so we don't need to offload checksums to workqueues.
699 */
700 BTRFS_FS_CSUM_IMPL_FAST,
794}; 701};
795 702
796struct btrfs_fs_info { 703struct btrfs_fs_info {
@@ -824,7 +731,7 @@ struct btrfs_fs_info {
824 struct extent_io_tree *pinned_extents; 731 struct extent_io_tree *pinned_extents;
825 732
826 /* logical->physical extent mapping */ 733 /* logical->physical extent mapping */
827 struct btrfs_mapping_tree mapping_tree; 734 struct extent_map_tree mapping_tree;
828 735
829 /* 736 /*
830 * block reservation for extent, checksum, root tree and 737 * block reservation for extent, checksum, root tree and
@@ -1160,6 +1067,14 @@ struct btrfs_fs_info {
1160 spinlock_t swapfile_pins_lock; 1067 spinlock_t swapfile_pins_lock;
1161 struct rb_root swapfile_pins; 1068 struct rb_root swapfile_pins;
1162 1069
1070 struct crypto_shash *csum_shash;
1071
1072 /*
1073 * Number of send operations in progress.
1074 * Updated while holding fs_info::balance_mutex.
1075 */
1076 int send_in_progress;
1077
1163#ifdef CONFIG_BTRFS_FS_REF_VERIFY 1078#ifdef CONFIG_BTRFS_FS_REF_VERIFY
1164 spinlock_t ref_verify_lock; 1079 spinlock_t ref_verify_lock;
1165 struct rb_root block_tree; 1080 struct rb_root block_tree;
@@ -2451,6 +2366,11 @@ static inline int btrfs_super_csum_size(const struct btrfs_super_block *s)
2451 return btrfs_csum_sizes[t]; 2366 return btrfs_csum_sizes[t];
2452} 2367}
2453 2368
2369static inline const char *btrfs_super_csum_name(u16 csum_type)
2370{
2371 /* csum type is validated at mount time */
2372 return btrfs_csum_names[csum_type];
2373}
2454 2374
2455/* 2375/*
2456 * The leaf data grows from end-to-front in the node. 2376 * The leaf data grows from end-to-front in the node.
@@ -2642,6 +2562,16 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
2642 ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ 2562 ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
2643 btrfs_item_offset_nr(leaf, slot))) 2563 btrfs_item_offset_nr(leaf, slot)))
2644 2564
2565static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
2566{
2567 return crc32c(crc, address, length);
2568}
2569
2570static inline void btrfs_crc32c_final(u32 crc, u8 *result)
2571{
2572 put_unaligned_le32(~crc, result);
2573}
2574
2645static inline u64 btrfs_name_hash(const char *name, int len) 2575static inline u64 btrfs_name_hash(const char *name, int len)
2646{ 2576{
2647 return crc32c((u32)~1, name, len); 2577 return crc32c((u32)~1, name, len);
@@ -2656,12 +2586,6 @@ static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
2656 return (u64) crc32c(parent_objectid, name, len); 2586 return (u64) crc32c(parent_objectid, name, len);
2657} 2587}
2658 2588
2659static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
2660{
2661 return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
2662 (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
2663}
2664
2665static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) 2589static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
2666{ 2590{
2667 return mapping_gfp_constraint(mapping, ~__GFP_FS); 2591 return mapping_gfp_constraint(mapping, ~__GFP_FS);
@@ -2698,8 +2622,6 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
2698 return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; 2622 return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
2699} 2623}
2700 2624
2701int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
2702bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
2703void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 2625void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
2704 const u64 start); 2626 const u64 start);
2705void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); 2627void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
@@ -2814,17 +2736,28 @@ enum btrfs_flush_state {
2814 COMMIT_TRANS = 9, 2736 COMMIT_TRANS = 9,
2815}; 2737};
2816 2738
2817int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); 2739/*
2818int btrfs_check_data_free_space(struct inode *inode, 2740 * control flags for do_chunk_alloc's force field
2819 struct extent_changeset **reserved, u64 start, u64 len); 2741 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
2820void btrfs_free_reserved_data_space(struct inode *inode, 2742 * if we really need one.
2821 struct extent_changeset *reserved, u64 start, u64 len); 2743 *
2822void btrfs_delalloc_release_space(struct inode *inode, 2744 * CHUNK_ALLOC_LIMITED means to only try and allocate one
2823 struct extent_changeset *reserved, 2745 * if we have very few chunks already allocated. This is
2824 u64 start, u64 len, bool qgroup_free); 2746 * used as part of the clustering code to help make sure
2825void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, 2747 * we have a good pool of storage to cluster in, without
2826 u64 len); 2748 * filling the FS with empty chunks
2827void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); 2749 *
2750 * CHUNK_ALLOC_FORCE means it must try to allocate one
2751 *
2752 */
2753enum btrfs_chunk_alloc_enum {
2754 CHUNK_ALLOC_NO_FORCE,
2755 CHUNK_ALLOC_LIMITED,
2756 CHUNK_ALLOC_FORCE,
2757};
2758
2759int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
2760 enum btrfs_chunk_alloc_enum force);
2828int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 2761int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
2829 struct btrfs_block_rsv *rsv, 2762 struct btrfs_block_rsv *rsv,
2830 int nitems, bool use_global_rsv); 2763 int nitems, bool use_global_rsv);
@@ -2834,41 +2767,6 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
2834 bool qgroup_free); 2767 bool qgroup_free);
2835 2768
2836int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); 2769int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
2837void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
2838 bool qgroup_free);
2839int btrfs_delalloc_reserve_space(struct inode *inode,
2840 struct extent_changeset **reserved, u64 start, u64 len);
2841void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
2842struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
2843 unsigned short type);
2844void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
2845 struct btrfs_block_rsv *rsv,
2846 unsigned short type);
2847void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
2848 struct btrfs_block_rsv *rsv);
2849int btrfs_block_rsv_add(struct btrfs_root *root,
2850 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
2851 enum btrfs_reserve_flush_enum flush);
2852int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
2853int btrfs_block_rsv_refill(struct btrfs_root *root,
2854 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
2855 enum btrfs_reserve_flush_enum flush);
2856int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2857 struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
2858 bool update_size);
2859int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
2860 struct btrfs_block_rsv *dest, u64 num_bytes,
2861 int min_factor);
2862void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
2863 struct btrfs_block_rsv *block_rsv,
2864 u64 num_bytes);
2865void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
2866void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
2867int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
2868 enum btrfs_reserve_flush_enum flush);
2869void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
2870 struct btrfs_block_rsv *src,
2871 u64 num_bytes);
2872int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); 2770int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache);
2873void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); 2771void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
2874void btrfs_put_block_group_cache(struct btrfs_fs_info *info); 2772void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
@@ -3186,7 +3084,8 @@ int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot,
3186struct btrfs_dio_private; 3084struct btrfs_dio_private;
3187int btrfs_del_csums(struct btrfs_trans_handle *trans, 3085int btrfs_del_csums(struct btrfs_trans_handle *trans,
3188 struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); 3086 struct btrfs_fs_info *fs_info, u64 bytenr, u64 len);
3189blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); 3087blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
3088 u8 *dst);
3190blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, 3089blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
3191 u64 logical_offset); 3090 u64 logical_offset);
3192int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 3091int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
@@ -3514,8 +3413,7 @@ __cold
3514static inline void assfail(const char *expr, const char *file, int line) 3413static inline void assfail(const char *expr, const char *file, int line)
3515{ 3414{
3516 if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { 3415 if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
3517 pr_err("assertion failed: %s, file: %s, line: %d\n", 3416 pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
3518 expr, file, line);
3519 BUG(); 3417 BUG();
3520 } 3418 }
3521} 3419}
@@ -3599,10 +3497,11 @@ do { \
3599/* compatibility and incompatibility defines */ 3497/* compatibility and incompatibility defines */
3600 3498
3601#define btrfs_set_fs_incompat(__fs_info, opt) \ 3499#define btrfs_set_fs_incompat(__fs_info, opt) \
3602 __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) 3500 __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \
3501 #opt)
3603 3502
3604static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, 3503static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
3605 u64 flag) 3504 u64 flag, const char* name)
3606{ 3505{
3607 struct btrfs_super_block *disk_super; 3506 struct btrfs_super_block *disk_super;
3608 u64 features; 3507 u64 features;
@@ -3615,18 +3514,20 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
3615 if (!(features & flag)) { 3514 if (!(features & flag)) {
3616 features |= flag; 3515 features |= flag;
3617 btrfs_set_super_incompat_flags(disk_super, features); 3516 btrfs_set_super_incompat_flags(disk_super, features);
3618 btrfs_info(fs_info, "setting %llu feature flag", 3517 btrfs_info(fs_info,
3619 flag); 3518 "setting incompat feature flag for %s (0x%llx)",
3519 name, flag);
3620 } 3520 }
3621 spin_unlock(&fs_info->super_lock); 3521 spin_unlock(&fs_info->super_lock);
3622 } 3522 }
3623} 3523}
3624 3524
3625#define btrfs_clear_fs_incompat(__fs_info, opt) \ 3525#define btrfs_clear_fs_incompat(__fs_info, opt) \
3626 __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) 3526 __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \
3527 #opt)
3627 3528
3628static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, 3529static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
3629 u64 flag) 3530 u64 flag, const char* name)
3630{ 3531{
3631 struct btrfs_super_block *disk_super; 3532 struct btrfs_super_block *disk_super;
3632 u64 features; 3533 u64 features;
@@ -3639,8 +3540,9 @@ static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
3639 if (features & flag) { 3540 if (features & flag) {
3640 features &= ~flag; 3541 features &= ~flag;
3641 btrfs_set_super_incompat_flags(disk_super, features); 3542 btrfs_set_super_incompat_flags(disk_super, features);
3642 btrfs_info(fs_info, "clearing %llu feature flag", 3543 btrfs_info(fs_info,
3643 flag); 3544 "clearing incompat feature flag for %s (0x%llx)",
3545 name, flag);
3644 } 3546 }
3645 spin_unlock(&fs_info->super_lock); 3547 spin_unlock(&fs_info->super_lock);
3646 } 3548 }
@@ -3657,10 +3559,11 @@ static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
3657} 3559}
3658 3560
3659#define btrfs_set_fs_compat_ro(__fs_info, opt) \ 3561#define btrfs_set_fs_compat_ro(__fs_info, opt) \
3660 __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) 3562 __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \
3563 #opt)
3661 3564
3662static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, 3565static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
3663 u64 flag) 3566 u64 flag, const char *name)
3664{ 3567{
3665 struct btrfs_super_block *disk_super; 3568 struct btrfs_super_block *disk_super;
3666 u64 features; 3569 u64 features;
@@ -3673,18 +3576,20 @@ static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
3673 if (!(features & flag)) { 3576 if (!(features & flag)) {
3674 features |= flag; 3577 features |= flag;
3675 btrfs_set_super_compat_ro_flags(disk_super, features); 3578 btrfs_set_super_compat_ro_flags(disk_super, features);
3676 btrfs_info(fs_info, "setting %llu ro feature flag", 3579 btrfs_info(fs_info,
3677 flag); 3580 "setting compat-ro feature flag for %s (0x%llx)",
3581 name, flag);
3678 } 3582 }
3679 spin_unlock(&fs_info->super_lock); 3583 spin_unlock(&fs_info->super_lock);
3680 } 3584 }
3681} 3585}
3682 3586
3683#define btrfs_clear_fs_compat_ro(__fs_info, opt) \ 3587#define btrfs_clear_fs_compat_ro(__fs_info, opt) \
3684 __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) 3588 __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \
3589 #opt)
3685 3590
3686static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, 3591static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
3687 u64 flag) 3592 u64 flag, const char *name)
3688{ 3593{
3689 struct btrfs_super_block *disk_super; 3594 struct btrfs_super_block *disk_super;
3690 u64 features; 3595 u64 features;
@@ -3697,8 +3602,9 @@ static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
3697 if (features & flag) { 3602 if (features & flag) {
3698 features &= ~flag; 3603 features &= ~flag;
3699 btrfs_set_super_compat_ro_flags(disk_super, features); 3604 btrfs_set_super_compat_ro_flags(disk_super, features);
3700 btrfs_info(fs_info, "clearing %llu ro feature flag", 3605 btrfs_info(fs_info,
3701 flag); 3606 "clearing compat-ro feature flag for %s (0x%llx)",
3607 name, flag);
3702 } 3608 }
3703 spin_unlock(&fs_info->super_lock); 3609 spin_unlock(&fs_info->super_lock);
3704 } 3610 }
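
The hunk above threads the stringified feature name (#opt) into the helpers alongside the flag bit (##opt), so the log line can report both. As a rough standalone illustration of that preprocessor pattern (not btrfs code; the feature names and bit values below are made up for the demo):

/* stringify-and-paste demo: ##opt builds the constant name, #opt its string */
#include <stdio.h>
#include <stdint.h>

#define FEATURE_INCOMPAT_RAID56		(1ULL << 7)	/* made-up value */
#define FEATURE_INCOMPAT_SKINNY_META	(1ULL << 8)	/* made-up value */

static void __set_feature(uint64_t flag, const char *name)
{
	printf("setting incompat feature flag for %s (0x%llx)\n",
	       name, (unsigned long long)flag);
}

#define set_feature(opt) \
	__set_feature(FEATURE_INCOMPAT_##opt, #opt)

int main(void)
{
	set_feature(RAID56);		/* ... for RAID56 (0x80) */
	set_feature(SKINNY_META);	/* ... for SKINNY_META (0x100) */
	return 0;
}
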
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
new file mode 100644
index 000000000000..17f7c0d38768
--- /dev/null
+++ b/fs/btrfs/delalloc-space.c
@@ -0,0 +1,494 @@
1// SPDX-License-Identifier: GPL-2.0
2
3#include "ctree.h"
4#include "delalloc-space.h"
5#include "block-rsv.h"
6#include "btrfs_inode.h"
7#include "space-info.h"
8#include "transaction.h"
9#include "qgroup.h"
10
11int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
12{
13 struct btrfs_root *root = inode->root;
14 struct btrfs_fs_info *fs_info = root->fs_info;
15 struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
16 u64 used;
17 int ret = 0;
18 int need_commit = 2;
19 int have_pinned_space;
20
21 /* Make sure bytes are sectorsize aligned */
22 bytes = ALIGN(bytes, fs_info->sectorsize);
23
24 if (btrfs_is_free_space_inode(inode)) {
25 need_commit = 0;
26 ASSERT(current->journal_info);
27 }
28
29again:
30 /* Make sure we have enough space to handle the data first */
31 spin_lock(&data_sinfo->lock);
32 used = btrfs_space_info_used(data_sinfo, true);
33
34 if (used + bytes > data_sinfo->total_bytes) {
35 struct btrfs_trans_handle *trans;
36
37 /*
38 * If we don't have enough free bytes in this space then we need
39 * to alloc a new chunk.
40 */
41 if (!data_sinfo->full) {
42 u64 alloc_target;
43
44 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
45 spin_unlock(&data_sinfo->lock);
46
47 alloc_target = btrfs_data_alloc_profile(fs_info);
48 /*
 49 * It is ugly that we don't call the nolock join
 50 * transaction for the free space inode case here.
 51 * But it is safe because we only do the data space
 52 * reservation for the free space cache in the
 53 * transaction context; the common join transaction
 54 * just increases the counter of the current transaction
 55 * handle and doesn't try to acquire the trans_lock of
 56 * the fs.
 57 */
58 trans = btrfs_join_transaction(root);
59 if (IS_ERR(trans))
60 return PTR_ERR(trans);
61
62 ret = btrfs_chunk_alloc(trans, alloc_target,
63 CHUNK_ALLOC_NO_FORCE);
64 btrfs_end_transaction(trans);
65 if (ret < 0) {
66 if (ret != -ENOSPC)
67 return ret;
68 else {
69 have_pinned_space = 1;
70 goto commit_trans;
71 }
72 }
73
74 goto again;
75 }
76
77 /*
78 * If we don't have enough pinned space to deal with this
 79 * allocation, and no chunk was removed in the current transaction,
80 * don't bother committing the transaction.
81 */
82 have_pinned_space = __percpu_counter_compare(
83 &data_sinfo->total_bytes_pinned,
84 used + bytes - data_sinfo->total_bytes,
85 BTRFS_TOTAL_BYTES_PINNED_BATCH);
86 spin_unlock(&data_sinfo->lock);
87
88 /* Commit the current transaction and try again */
89commit_trans:
90 if (need_commit) {
91 need_commit--;
92
93 if (need_commit > 0) {
94 btrfs_start_delalloc_roots(fs_info, -1);
95 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
96 (u64)-1);
97 }
98
99 trans = btrfs_join_transaction(root);
100 if (IS_ERR(trans))
101 return PTR_ERR(trans);
102 if (have_pinned_space >= 0 ||
103 test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
104 &trans->transaction->flags) ||
105 need_commit > 0) {
106 ret = btrfs_commit_transaction(trans);
107 if (ret)
108 return ret;
109 /*
110 * The cleaner kthread might still be doing iput
111 * operations. Wait for it to finish so that
112 * more space is released. We don't need to
113 * explicitly run the delayed iputs here because
114 * the commit_transaction would have woken up
115 * the cleaner.
116 */
117 ret = btrfs_wait_on_delayed_iputs(fs_info);
118 if (ret)
119 return ret;
120 goto again;
121 } else {
122 btrfs_end_transaction(trans);
123 }
124 }
125
126 trace_btrfs_space_reservation(fs_info,
127 "space_info:enospc",
128 data_sinfo->flags, bytes, 1);
129 return -ENOSPC;
130 }
131 btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
132 trace_btrfs_space_reservation(fs_info, "space_info",
133 data_sinfo->flags, bytes, 1);
134 spin_unlock(&data_sinfo->lock);
135
136 return 0;
137}
138
139int btrfs_check_data_free_space(struct inode *inode,
140 struct extent_changeset **reserved, u64 start, u64 len)
141{
142 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
143 int ret;
144
145 /* align the range */
146 len = round_up(start + len, fs_info->sectorsize) -
147 round_down(start, fs_info->sectorsize);
148 start = round_down(start, fs_info->sectorsize);
149
150 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
151 if (ret < 0)
152 return ret;
153
154 /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
155 ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
156 if (ret < 0)
157 btrfs_free_reserved_data_space_noquota(inode, start, len);
158 else
159 ret = 0;
160 return ret;
161}
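
btrfs_check_data_free_space() first widens the requested byte range to whole sectors before reserving data and qgroup space. A quick standalone model of that round_up()/round_down() alignment (the 4K sectorsize is just an assumed example):

#include <stdio.h>
#include <stdint.h>

/* same idea as the kernel's round_up()/round_down() for power-of-two sizes */
#define round_down(x, y)	((x) & ~((uint64_t)(y) - 1))
#define round_up(x, y)		round_down((x) + (y) - 1, (y))

int main(void)
{
	uint64_t sectorsize = 4096;		/* assumed for the example */
	uint64_t start = 5000, len = 100;

	/* widen [start, start + len) to sector boundaries, as the function does */
	uint64_t alen   = round_up(start + len, sectorsize) -
			  round_down(start, sectorsize);
	uint64_t astart = round_down(start, sectorsize);

	/* prints: reserve 4096 bytes starting at 4096 */
	printf("reserve %llu bytes starting at %llu\n",
	       (unsigned long long)alen, (unsigned long long)astart);
	return 0;
}
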
162
163/*
164 * Called if we need to clear a data reservation for this inode,
165 * normally in an error case.
166 *
167 * This one will *NOT* use the accurate qgroup reserved space API; it is only
168 * for cases where we can't sleep and are sure it won't affect qgroup
169 * reserved space, like clear_bit_hook().
170 */
171void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
172 u64 len)
173{
174 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
175 struct btrfs_space_info *data_sinfo;
176
177 /* Make sure the range is aligned to sectorsize */
178 len = round_up(start + len, fs_info->sectorsize) -
179 round_down(start, fs_info->sectorsize);
180 start = round_down(start, fs_info->sectorsize);
181
182 data_sinfo = fs_info->data_sinfo;
183 spin_lock(&data_sinfo->lock);
184 btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
185 trace_btrfs_space_reservation(fs_info, "space_info",
186 data_sinfo->flags, len, 0);
187 spin_unlock(&data_sinfo->lock);
188}
189
190/*
191 * Called if we need to clear a data reservation for this inode,
192 * normally in an error case.
193 *
194 * This one will handle the per-inode data rsv map for the accurate reserved
195 * space framework.
196 */
197void btrfs_free_reserved_data_space(struct inode *inode,
198 struct extent_changeset *reserved, u64 start, u64 len)
199{
200 struct btrfs_root *root = BTRFS_I(inode)->root;
201
202 /* Make sure the range is aligned to sectorsize */
203 len = round_up(start + len, root->fs_info->sectorsize) -
204 round_down(start, root->fs_info->sectorsize);
205 start = round_down(start, root->fs_info->sectorsize);
206
207 btrfs_free_reserved_data_space_noquota(inode, start, len);
208 btrfs_qgroup_free_data(inode, reserved, start, len);
209}
210
211/**
212 * btrfs_inode_rsv_release - release any excessive reservation.
213 * @inode - the inode we need to release from.
214 * @qgroup_free - free or convert qgroup meta.
215 * Unlike normal operation, qgroup meta reservation needs to know if we are
216 * freeing qgroup reservation or just converting it into per-trans. Normally
217 * @qgroup_free is true for error handling, and false for normal release.
218 *
219 * This is the same as btrfs_block_rsv_release, except that it handles the
220 * tracepoint for the reservation.
221 */
222static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
223{
224 struct btrfs_fs_info *fs_info = inode->root->fs_info;
225 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
226 u64 released = 0;
227 u64 qgroup_to_release = 0;
228
229 /*
230 * Since we statically set the block_rsv->size we just want to say we
231 * are releasing 0 bytes, and then we'll just get the reservation over
232 * the size free'd.
233 */
234 released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
235 &qgroup_to_release);
236 if (released > 0)
237 trace_btrfs_space_reservation(fs_info, "delalloc",
238 btrfs_ino(inode), released, 0);
239 if (qgroup_free)
240 btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
241 else
242 btrfs_qgroup_convert_reserved_meta(inode->root,
243 qgroup_to_release);
244}
245
246static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
247 struct btrfs_inode *inode)
248{
249 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
250 u64 reserve_size = 0;
251 u64 qgroup_rsv_size = 0;
252 u64 csum_leaves;
253 unsigned outstanding_extents;
254
255 lockdep_assert_held(&inode->lock);
256 outstanding_extents = inode->outstanding_extents;
257 if (outstanding_extents)
258 reserve_size = btrfs_calc_trans_metadata_size(fs_info,
259 outstanding_extents + 1);
260 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
261 inode->csum_bytes);
262 reserve_size += btrfs_calc_trans_metadata_size(fs_info,
263 csum_leaves);
264 /*
265 * For qgroup rsv, the calculation is very simple:
266 * account one nodesize for each outstanding extent
267 *
 268 * This overestimates in most cases.
269 */
270 qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
271
272 spin_lock(&block_rsv->lock);
273 block_rsv->size = reserve_size;
274 block_rsv->qgroup_rsv_size = qgroup_rsv_size;
275 spin_unlock(&block_rsv->lock);
276}
277
278static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
279 u64 num_bytes, u64 *meta_reserve,
280 u64 *qgroup_reserve)
281{
282 u64 nr_extents = count_max_extents(num_bytes);
283 u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
284
285 /* We add one for the inode update at finish ordered time */
286 *meta_reserve = btrfs_calc_trans_metadata_size(fs_info,
287 nr_extents + csum_leaves + 1);
288 *qgroup_reserve = nr_extents * fs_info->nodesize;
289}
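
calc_inode_reservations() sizes the worst case up front: one metadata item per possible extent, plus the csum leaves those bytes may need, plus one item for the inode update at finish-ordered time, while the qgroup side charges one nodesize per extent. Below is a standalone model of that arithmetic; the granularities and per-item size are assumed placeholders, not the real kernel helpers:

#include <stdio.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Assumed stand-ins; the real values come from fs geometry and the helpers. */
static const uint64_t max_extent_size = 128ULL << 20;	/* count_max_extents() granularity */
static const uint64_t bytes_per_csum_leaf = 1ULL << 20;	/* placeholder for btrfs_csum_bytes_to_leaves() */
static const uint64_t per_item_bytes = 262144;		/* placeholder for btrfs_calc_trans_metadata_size(fs_info, 1) */
static const uint64_t nodesize = 16384;

int main(void)
{
	uint64_t num_bytes = 4ULL << 20;	/* a 4 MiB delalloc write */

	uint64_t nr_extents  = DIV_ROUND_UP(num_bytes, max_extent_size);
	uint64_t csum_leaves = DIV_ROUND_UP(num_bytes, bytes_per_csum_leaf);

	/* the extra +1 is the inode update at finish-ordered time */
	uint64_t meta_reserve   = per_item_bytes * (nr_extents + csum_leaves + 1);
	uint64_t qgroup_reserve = nr_extents * nodesize;

	/* with these placeholders: extents=1 csum_leaves=4 meta=1572864 qgroup=16384 */
	printf("extents=%llu csum_leaves=%llu meta=%llu qgroup=%llu\n",
	       (unsigned long long)nr_extents, (unsigned long long)csum_leaves,
	       (unsigned long long)meta_reserve, (unsigned long long)qgroup_reserve);
	return 0;
}
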
290
291int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
292{
293 struct btrfs_root *root = inode->root;
294 struct btrfs_fs_info *fs_info = root->fs_info;
295 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
296 u64 meta_reserve, qgroup_reserve;
297 unsigned nr_extents;
298 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
299 int ret = 0;
300 bool delalloc_lock = true;
301
302 /*
303 * If we are a free space inode we need to not flush since we will be in
304 * the middle of a transaction commit. We also don't need the delalloc
305 * mutex since we won't race with anybody. We need this mostly to make
306 * lockdep shut its filthy mouth.
307 *
308 * If we have a transaction open (can happen if we call truncate_block
309 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
310 */
311 if (btrfs_is_free_space_inode(inode)) {
312 flush = BTRFS_RESERVE_NO_FLUSH;
313 delalloc_lock = false;
314 } else {
315 if (current->journal_info)
316 flush = BTRFS_RESERVE_FLUSH_LIMIT;
317
318 if (btrfs_transaction_in_commit(fs_info))
319 schedule_timeout(1);
320 }
321
322 if (delalloc_lock)
323 mutex_lock(&inode->delalloc_mutex);
324
325 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
326
327 /*
328 * We always want to do it this way, every other way is wrong and ends
329 * in tears. Pre-reserving the amount we are going to add will always
330 * be the right way, because otherwise if we have enough parallelism we
331 * could end up with thousands of inodes all holding little bits of
332 * reservations they were able to make previously and the only way to
333 * reclaim that space is to ENOSPC out the operations and clear
334 * everything out and try again, which is bad. This way we just
335 * over-reserve slightly, and clean up the mess when we are done.
336 */
337 calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
338 &qgroup_reserve);
339 ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
340 if (ret)
341 goto out_fail;
342 ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
343 if (ret)
344 goto out_qgroup;
345
346 /*
347 * Now we need to update our outstanding extents and csum bytes _first_
348 * and then add the reservation to the block_rsv. This keeps us from
349 * racing with an ordered completion or some such that would think it
350 * needs to free the reservation we just made.
351 */
352 spin_lock(&inode->lock);
353 nr_extents = count_max_extents(num_bytes);
354 btrfs_mod_outstanding_extents(inode, nr_extents);
355 inode->csum_bytes += num_bytes;
356 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
357 spin_unlock(&inode->lock);
358
359 /* Now we can safely add our space to our block rsv */
360 btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false);
361 trace_btrfs_space_reservation(root->fs_info, "delalloc",
362 btrfs_ino(inode), meta_reserve, 1);
363
364 spin_lock(&block_rsv->lock);
365 block_rsv->qgroup_rsv_reserved += qgroup_reserve;
366 spin_unlock(&block_rsv->lock);
367
368 if (delalloc_lock)
369 mutex_unlock(&inode->delalloc_mutex);
370 return 0;
371out_qgroup:
372 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
373out_fail:
374 btrfs_inode_rsv_release(inode, true);
375 if (delalloc_lock)
376 mutex_unlock(&inode->delalloc_mutex);
377 return ret;
378}
379
380/**
381 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
382 * @inode: the inode to release the reservation for.
383 * @num_bytes: the number of bytes we are releasing.
384 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
385 *
386 * This will release the metadata reservation for an inode. This can be called
387 * once we complete IO for a given set of bytes to release their metadata
388 * reservations, or on error for the same reason.
389 */
390void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
391 bool qgroup_free)
392{
393 struct btrfs_fs_info *fs_info = inode->root->fs_info;
394
395 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
396 spin_lock(&inode->lock);
397 inode->csum_bytes -= num_bytes;
398 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
399 spin_unlock(&inode->lock);
400
401 if (btrfs_is_testing(fs_info))
402 return;
403
404 btrfs_inode_rsv_release(inode, qgroup_free);
405}
406
407/**
408 * btrfs_delalloc_release_extents - release our outstanding_extents
409 * @inode: the inode to balance the reservation for.
410 * @num_bytes: the number of bytes we originally reserved with
411 * @qgroup_free: do we need to free qgroup meta reservation or convert them.
412 *
413 * When we reserve space we increase outstanding_extents for the extents we may
414 * add. Once we've set the range as delalloc or created our ordered extents we
415 * have outstanding_extents to track the real usage, so we use this to free our
416 * temporarily tracked outstanding_extents. This _must_ be used in conjunction
417 * with btrfs_delalloc_reserve_metadata.
418 */
419void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
420 bool qgroup_free)
421{
422 struct btrfs_fs_info *fs_info = inode->root->fs_info;
423 unsigned num_extents;
424
425 spin_lock(&inode->lock);
426 num_extents = count_max_extents(num_bytes);
427 btrfs_mod_outstanding_extents(inode, -num_extents);
428 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
429 spin_unlock(&inode->lock);
430
431 if (btrfs_is_testing(fs_info))
432 return;
433
434 btrfs_inode_rsv_release(inode, qgroup_free);
435}
436
437/**
438 * btrfs_delalloc_reserve_space - reserve data and metadata space for
439 * delalloc
440 * @inode: inode we're writing to
441 * @start: start range we are writing to
 442 * @len: length of the range we are writing to
 443 * @reserved: mandatory parameter, records the actually reserved qgroup ranges
 444 * of the current reservation.
445 *
446 * This will do the following things
447 *
448 * - reserve space in data space info for num bytes
449 * and reserve precious corresponding qgroup space
450 * (Done in check_data_free_space)
451 *
452 * - reserve space for metadata space, based on the number of outstanding
453 * extents and how much csums will be needed
454 * also reserve metadata space in a per root over-reserve method.
455 * - add to the inodes->delalloc_bytes
456 * - add it to the fs_info's delalloc inodes list.
457 * (Above 3 all done in delalloc_reserve_metadata)
458 *
459 * Return 0 for success
 460 * Return <0 for error (-ENOSPC or -EDQUOT)
461 */
462int btrfs_delalloc_reserve_space(struct inode *inode,
463 struct extent_changeset **reserved, u64 start, u64 len)
464{
465 int ret;
466
467 ret = btrfs_check_data_free_space(inode, reserved, start, len);
468 if (ret < 0)
469 return ret;
470 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
471 if (ret < 0)
472 btrfs_free_reserved_data_space(inode, *reserved, start, len);
473 return ret;
474}
475
476/**
477 * btrfs_delalloc_release_space - release data and metadata space for delalloc
 478 * @inode: inode we're releasing space for
 479 * @start: start position of the space already reserved
 480 * @len: the length of the space already reserved
 481 * @qgroup_free: free the qgroup reservation or convert it to per-trans
482 *
483 * This function will release the metadata space that was not used and will
484 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
485 * list if there are no delalloc bytes left.
486 * Also it will handle the qgroup reserved space.
487 */
488void btrfs_delalloc_release_space(struct inode *inode,
489 struct extent_changeset *reserved,
490 u64 start, u64 len, bool qgroup_free)
491{
492 btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
493 btrfs_free_reserved_data_space(inode, reserved, start, len);
494}
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
new file mode 100644
index 000000000000..54466fbd7075
--- /dev/null
+++ b/fs/btrfs/delalloc-space.h
@@ -0,0 +1,23 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2
3#ifndef BTRFS_DELALLOC_SPACE_H
4#define BTRFS_DELALLOC_SPACE_H
5
6struct extent_changeset;
7
8int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
9int btrfs_check_data_free_space(struct inode *inode,
10 struct extent_changeset **reserved, u64 start, u64 len);
11void btrfs_free_reserved_data_space(struct inode *inode,
12 struct extent_changeset *reserved, u64 start, u64 len);
13void btrfs_delalloc_release_space(struct inode *inode,
14 struct extent_changeset *reserved,
15 u64 start, u64 len, bool qgroup_free);
16void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
17 u64 len);
18void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
19 bool qgroup_free);
20int btrfs_delalloc_reserve_space(struct inode *inode,
21 struct extent_changeset **reserved, u64 start, u64 len);
22
23#endif /* BTRFS_DELALLOC_SPACE_H */
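
Taken together, the prototypes above give the expected pairing for a write path: reserve data plus metadata in one shot, hand the whole reservation back on failure, and in either case drop the temporary outstanding_extents bump once the range is (or is not) tracked as delalloc. A sketch of that pairing, inferred only from the declarations in this patch; do_the_actual_write() is a placeholder, ordering and error handling are simplified, and this is illustrative rather than compilable outside the btrfs tree:

static int example_buffered_write(struct inode *inode, u64 pos, u64 len)
{
	struct extent_changeset *reserved = NULL;
	int ret;

	/* data space, qgroup data space and delalloc metadata in one call */
	ret = btrfs_delalloc_reserve_space(inode, &reserved, pos, len);
	if (ret < 0)
		return ret;

	ret = do_the_actual_write(inode, pos, len);	/* placeholder */
	if (ret < 0) {
		/* nothing was consumed: hand the data/metadata reservation back */
		btrfs_delalloc_release_space(inode, reserved, pos, len, true);
	}

	/*
	 * Drop the temporary outstanding_extents bump in both cases; on success
	 * the delalloc/ordered extents now track the real usage, on error the
	 * qgroup meta reservation is freed rather than converted.
	 */
	btrfs_delalloc_release_extents(BTRFS_I(inode), len, ret < 0);

	extent_changeset_free(reserved);
	return ret;
}
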
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index a73fc23e2961..9a91d1eb0af4 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -10,6 +10,7 @@
10#include "delayed-ref.h" 10#include "delayed-ref.h"
11#include "transaction.h" 11#include "transaction.h"
12#include "qgroup.h" 12#include "qgroup.h"
13#include "space-info.h"
13 14
14struct kmem_cache *btrfs_delayed_ref_head_cachep; 15struct kmem_cache *btrfs_delayed_ref_head_cachep;
15struct kmem_cache *btrfs_delayed_tree_ref_cachep; 16struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -24,6 +25,179 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep;
24 * of hammering updates on the extent allocation tree. 25 * of hammering updates on the extent allocation tree.
25 */ 26 */
26 27
28bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
29{
30 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
31 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
32 bool ret = false;
33 u64 reserved;
34
35 spin_lock(&global_rsv->lock);
36 reserved = global_rsv->reserved;
37 spin_unlock(&global_rsv->lock);
38
39 /*
40 * Since the global reserve is just kind of magic we don't really want
41 * to rely on it to save our bacon, so if our size is more than the
42 * delayed_refs_rsv and the global rsv then it's time to think about
43 * bailing.
44 */
45 spin_lock(&delayed_refs_rsv->lock);
46 reserved += delayed_refs_rsv->reserved;
47 if (delayed_refs_rsv->size >= reserved)
48 ret = true;
49 spin_unlock(&delayed_refs_rsv->lock);
50 return ret;
51}
52
53int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
54{
55 u64 num_entries =
56 atomic_read(&trans->transaction->delayed_refs.num_entries);
57 u64 avg_runtime;
58 u64 val;
59
60 smp_mb();
61 avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
62 val = num_entries * avg_runtime;
63 if (val >= NSEC_PER_SEC)
64 return 1;
65 if (val >= NSEC_PER_SEC / 2)
66 return 2;
67
68 return btrfs_check_space_for_delayed_refs(trans->fs_info);
69}
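
btrfs_should_throttle_delayed_refs() turns the queue depth into an estimated drain time and compares it against one second worth of work. A standalone model of those thresholds (the average per-ref runtime is an assumed example value):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC	1000000000ULL

/* returns 1 (throttle hard), 2 (throttle), or 0 (no time-based throttling) */
static int should_throttle(uint64_t num_entries, uint64_t avg_runtime_ns)
{
	uint64_t val = num_entries * avg_runtime_ns;

	if (val >= NSEC_PER_SEC)
		return 1;
	if (val >= NSEC_PER_SEC / 2)
		return 2;
	return 0;	/* the kernel falls back to the space check here */
}

int main(void)
{
	uint64_t avg = 100000;	/* assume 100us per delayed ref */

	printf("%d\n", should_throttle(20000, avg));	/* 2.0s of work -> 1 */
	printf("%d\n", should_throttle(6000, avg));	/* 0.6s of work -> 2 */
	printf("%d\n", should_throttle(1000, avg));	/* 0.1s of work -> 0 */
	return 0;
}
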
70
71/**
72 * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
73 * @fs_info - the fs_info for our fs.
74 * @nr - the number of items to drop.
75 *
76 * This drops the delayed ref head's count from the delayed refs rsv and frees
77 * any excess reservation we had.
78 */
79void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
80{
81 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
82 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
83 u64 released = 0;
84
85 released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes,
86 NULL);
87 if (released)
88 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
89 0, released, 0);
90}
91
92/*
93 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
94 * @trans - the trans that may have generated delayed refs
95 *
 96 * This is to be called any time we may have adjusted trans->delayed_ref_updates;
 97 * it'll calculate the additional size and add it to the delayed_refs_rsv.
98 */
99void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
100{
101 struct btrfs_fs_info *fs_info = trans->fs_info;
102 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
103 u64 num_bytes;
104
105 if (!trans->delayed_ref_updates)
106 return;
107
108 num_bytes = btrfs_calc_trans_metadata_size(fs_info,
109 trans->delayed_ref_updates);
110 spin_lock(&delayed_rsv->lock);
111 delayed_rsv->size += num_bytes;
112 delayed_rsv->full = 0;
113 spin_unlock(&delayed_rsv->lock);
114 trans->delayed_ref_updates = 0;
115}
116
117/**
118 * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
119 * @fs_info - the fs info for our fs.
120 * @src - the source block rsv to transfer from.
121 * @num_bytes - the number of bytes to transfer.
122 *
123 * This transfers up to the num_bytes amount from the src rsv to the
124 * delayed_refs_rsv. Any extra bytes are returned to the space info.
125 */
126void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
127 struct btrfs_block_rsv *src,
128 u64 num_bytes)
129{
130 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
131 u64 to_free = 0;
132
133 spin_lock(&src->lock);
134 src->reserved -= num_bytes;
135 src->size -= num_bytes;
136 spin_unlock(&src->lock);
137
138 spin_lock(&delayed_refs_rsv->lock);
139 if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
140 u64 delta = delayed_refs_rsv->size -
141 delayed_refs_rsv->reserved;
142 if (num_bytes > delta) {
143 to_free = num_bytes - delta;
144 num_bytes = delta;
145 }
146 } else {
147 to_free = num_bytes;
148 num_bytes = 0;
149 }
150
151 if (num_bytes)
152 delayed_refs_rsv->reserved += num_bytes;
153 if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
154 delayed_refs_rsv->full = 1;
155 spin_unlock(&delayed_refs_rsv->lock);
156
157 if (num_bytes)
158 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
159 0, num_bytes, 1);
160 if (to_free)
161 btrfs_space_info_add_old_bytes(fs_info,
162 delayed_refs_rsv->space_info, to_free);
163}
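
The transfer above only keeps as many bytes as the delayed refs rsv is actually short by and routes the remainder back to the space info. A small standalone model of that split (the rsv state and transfer size are arbitrary example numbers):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* example delayed refs rsv state */
	uint64_t size = 100, reserved = 70;
	uint64_t num_bytes = 50;	/* bytes migrated in from the source rsv */
	uint64_t to_free = 0;

	if (size > reserved) {
		uint64_t delta = size - reserved;	/* shortfall: 30 */
		if (num_bytes > delta) {
			to_free = num_bytes - delta;	/* excess goes back to the space info */
			num_bytes = delta;		/* only the shortfall is kept */
		}
	} else {
		to_free = num_bytes;
		num_bytes = 0;
	}
	reserved += num_bytes;

	/* prints: kept=30 refunded=20 reserved=100 full=1 */
	printf("kept=%llu refunded=%llu reserved=%llu full=%d\n",
	       (unsigned long long)num_bytes, (unsigned long long)to_free,
	       (unsigned long long)reserved, reserved >= size);
	return 0;
}
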
164
165/**
166 * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
167 * @fs_info - the fs_info for our fs.
168 * @flush - control how we can flush for this reservation.
169 *
 170 * This will refill the delayed block_rsv up to 1 item's worth of space and
171 * will return -ENOSPC if we can't make the reservation.
172 */
173int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
174 enum btrfs_reserve_flush_enum flush)
175{
176 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
177 u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
178 u64 num_bytes = 0;
179 int ret = -ENOSPC;
180
181 spin_lock(&block_rsv->lock);
182 if (block_rsv->reserved < block_rsv->size) {
183 num_bytes = block_rsv->size - block_rsv->reserved;
184 num_bytes = min(num_bytes, limit);
185 }
186 spin_unlock(&block_rsv->lock);
187
188 if (!num_bytes)
189 return 0;
190
191 ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv,
192 num_bytes, flush);
193 if (ret)
194 return ret;
195 btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
196 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
197 0, num_bytes, 1);
198 return 0;
199}
200
27/* 201/*
28 * compare two delayed tree backrefs with same bytenr and type 202 * compare two delayed tree backrefs with same bytenr and type
29 */ 203 */
@@ -957,13 +1131,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
957} 1131}
958 1132
959/* 1133/*
960 * this does a simple search for the head node for a given extent. 1134 * This does a simple search for the head node for a given extent. Returns the
961 * It must be called with the delayed ref spinlock held, and it returns 1135 * head node if found, or NULL if not.
962 * the head node if any where found, or NULL if not.
963 */ 1136 */
964struct btrfs_delayed_ref_head * 1137struct btrfs_delayed_ref_head *
965btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) 1138btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
966{ 1139{
1140 lockdep_assert_held(&delayed_refs->lock);
1141
967 return find_ref_head(delayed_refs, bytenr, false); 1142 return find_ref_head(delayed_refs, bytenr, false);
968} 1143}
969 1144
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index c18f93ea88ed..1c977e6d45dc 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -364,6 +364,16 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
364 364
365int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); 365int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
366 366
367void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
368void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
369int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
370 enum btrfs_reserve_flush_enum flush);
371void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
372 struct btrfs_block_rsv *src,
373 u64 num_bytes);
374int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
375bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
376
367/* 377/*
368 * helper functions to cast a node into its container 378 * helper functions to cast a node into its container
369 */ 379 */
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index ee0989c7e3a9..6b2e9aa83ffa 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -201,7 +201,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
201 return PTR_ERR(bdev); 201 return PTR_ERR(bdev);
202 } 202 }
203 203
204 filemap_write_and_wait(bdev->bd_inode->i_mapping); 204 sync_blockdev(bdev);
205 205
206 devices = &fs_info->fs_devices->devices; 206 devices = &fs_info->fs_devices->devices;
207 list_for_each_entry(device, devices, dev_list) { 207 list_for_each_entry(device, devices, dev_list) {
@@ -237,7 +237,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
237 } 237 }
238 rcu_assign_pointer(device->name, name); 238 rcu_assign_pointer(device->name, name);
239 239
240 mutex_lock(&fs_info->fs_devices->device_list_mutex);
241 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 240 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
242 device->generation = 0; 241 device->generation = 0;
243 device->io_width = fs_info->sectorsize; 242 device->io_width = fs_info->sectorsize;
@@ -256,6 +255,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
256 device->dev_stats_valid = 1; 255 device->dev_stats_valid = 1;
257 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 256 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
258 device->fs_devices = fs_info->fs_devices; 257 device->fs_devices = fs_info->fs_devices;
258
259 mutex_lock(&fs_info->fs_devices->device_list_mutex);
259 list_add(&device->dev_list, &fs_info->fs_devices->devices); 260 list_add(&device->dev_list, &fs_info->fs_devices->devices);
260 fs_info->fs_devices->num_devices++; 261 fs_info->fs_devices->num_devices++;
261 fs_info->fs_devices->open_devices++; 262 fs_info->fs_devices->open_devices++;
@@ -399,7 +400,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
399 int ret; 400 int ret;
400 struct btrfs_device *tgt_device = NULL; 401 struct btrfs_device *tgt_device = NULL;
401 struct btrfs_device *src_device = NULL; 402 struct btrfs_device *src_device = NULL;
402 bool need_unlock;
403 403
404 src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, 404 src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
405 srcdev_name); 405 srcdev_name);
@@ -413,11 +413,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
413 return -ETXTBSY; 413 return -ETXTBSY;
414 } 414 }
415 415
416 ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
417 src_device, &tgt_device);
418 if (ret)
419 return ret;
420
421 /* 416 /*
422 * Here we commit the transaction to make sure commit_total_bytes 417 * Here we commit the transaction to make sure commit_total_bytes
423 * of all the devices are updated. 418 * of all the devices are updated.
@@ -431,7 +426,11 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
431 return PTR_ERR(trans); 426 return PTR_ERR(trans);
432 } 427 }
433 428
434 need_unlock = true; 429 ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
430 src_device, &tgt_device);
431 if (ret)
432 return ret;
433
435 down_write(&dev_replace->rwsem); 434 down_write(&dev_replace->rwsem);
436 switch (dev_replace->replace_state) { 435 switch (dev_replace->replace_state) {
437 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 436 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -442,11 +441,11 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
442 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 441 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
443 ASSERT(0); 442 ASSERT(0);
444 ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; 443 ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
444 up_write(&dev_replace->rwsem);
445 goto leave; 445 goto leave;
446 } 446 }
447 447
448 dev_replace->cont_reading_from_srcdev_mode = read_src; 448 dev_replace->cont_reading_from_srcdev_mode = read_src;
449 WARN_ON(!src_device);
450 dev_replace->srcdev = src_device; 449 dev_replace->srcdev = src_device;
451 dev_replace->tgtdev = tgt_device; 450 dev_replace->tgtdev = tgt_device;
452 451
@@ -471,7 +470,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
471 atomic64_set(&dev_replace->num_write_errors, 0); 470 atomic64_set(&dev_replace->num_write_errors, 0);
472 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); 471 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
473 up_write(&dev_replace->rwsem); 472 up_write(&dev_replace->rwsem);
474 need_unlock = false;
475 473
476 ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); 474 ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
477 if (ret) 475 if (ret)
@@ -479,16 +477,16 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
479 477
480 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); 478 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
481 479
482 /* force writing the updated state information to disk */ 480 /* Commit dev_replace state and reserve 1 item for it. */
483 trans = btrfs_start_transaction(root, 0); 481 trans = btrfs_start_transaction(root, 1);
484 if (IS_ERR(trans)) { 482 if (IS_ERR(trans)) {
485 ret = PTR_ERR(trans); 483 ret = PTR_ERR(trans);
486 need_unlock = true;
487 down_write(&dev_replace->rwsem); 484 down_write(&dev_replace->rwsem);
488 dev_replace->replace_state = 485 dev_replace->replace_state =
489 BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; 486 BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
490 dev_replace->srcdev = NULL; 487 dev_replace->srcdev = NULL;
491 dev_replace->tgtdev = NULL; 488 dev_replace->tgtdev = NULL;
489 up_write(&dev_replace->rwsem);
492 goto leave; 490 goto leave;
493 } 491 }
494 492
@@ -510,8 +508,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
510 return ret; 508 return ret;
511 509
512leave: 510leave:
513 if (need_unlock)
514 up_write(&dev_replace->rwsem);
515 btrfs_destroy_dev_replace_tgtdev(tgt_device); 511 btrfs_destroy_dev_replace_tgtdev(tgt_device);
516 return ret; 512 return ret;
517} 513}
@@ -678,7 +674,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
678 btrfs_device_set_disk_total_bytes(tgt_device, 674 btrfs_device_set_disk_total_bytes(tgt_device,
679 src_device->disk_total_bytes); 675 src_device->disk_total_bytes);
680 btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); 676 btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
681 tgt_device->commit_total_bytes = src_device->commit_total_bytes;
682 tgt_device->commit_bytes_used = src_device->bytes_used; 677 tgt_device->commit_bytes_used = src_device->bytes_used;
683 678
684 btrfs_assign_next_active_device(src_device, tgt_device); 679 btrfs_assign_next_active_device(src_device, tgt_device);
@@ -728,7 +723,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
728 struct btrfs_device *srcdev, 723 struct btrfs_device *srcdev,
729 struct btrfs_device *tgtdev) 724 struct btrfs_device *tgtdev)
730{ 725{
731 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 726 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
732 struct extent_map *em; 727 struct extent_map *em;
733 struct map_lookup *map; 728 struct map_lookup *map;
734 u64 start = 0; 729 u64 start = 0;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index deb74a8c191a..41a2bd2e0c56 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -19,6 +19,7 @@
19#include <linux/crc32c.h> 19#include <linux/crc32c.h>
20#include <linux/sched/mm.h> 20#include <linux/sched/mm.h>
21#include <asm/unaligned.h> 21#include <asm/unaligned.h>
22#include <crypto/hash.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "disk-io.h" 24#include "disk-io.h"
24#include "transaction.h" 25#include "transaction.h"
@@ -40,10 +41,6 @@
40#include "tree-checker.h" 41#include "tree-checker.h"
41#include "ref-verify.h" 42#include "ref-verify.h"
42 43
43#ifdef CONFIG_X86
44#include <asm/cpufeature.h>
45#endif
46
47#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ 44#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
48 BTRFS_HEADER_FLAG_RELOC |\ 45 BTRFS_HEADER_FLAG_RELOC |\
49 BTRFS_SUPER_FLAG_ERROR |\ 46 BTRFS_SUPER_FLAG_ERROR |\
@@ -249,16 +246,6 @@ out:
249 return em; 246 return em;
250} 247}
251 248
252u32 btrfs_csum_data(const char *data, u32 seed, size_t len)
253{
254 return crc32c(seed, data, len);
255}
256
257void btrfs_csum_final(u32 crc, u8 *result)
258{
259 put_unaligned_le32(~crc, result);
260}
261
262/* 249/*
263 * Compute the csum of a btree block and store the result to provided buffer. 250 * Compute the csum of a btree block and store the result to provided buffer.
264 * 251 *
@@ -266,6 +253,8 @@ void btrfs_csum_final(u32 crc, u8 *result)
266 */ 253 */
267static int csum_tree_block(struct extent_buffer *buf, u8 *result) 254static int csum_tree_block(struct extent_buffer *buf, u8 *result)
268{ 255{
256 struct btrfs_fs_info *fs_info = buf->fs_info;
257 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
269 unsigned long len; 258 unsigned long len;
270 unsigned long cur_len; 259 unsigned long cur_len;
271 unsigned long offset = BTRFS_CSUM_SIZE; 260 unsigned long offset = BTRFS_CSUM_SIZE;
@@ -273,9 +262,12 @@ static int csum_tree_block(struct extent_buffer *buf, u8 *result)
273 unsigned long map_start; 262 unsigned long map_start;
274 unsigned long map_len; 263 unsigned long map_len;
275 int err; 264 int err;
276 u32 crc = ~(u32)0; 265
266 shash->tfm = fs_info->csum_shash;
267 crypto_shash_init(shash);
277 268
278 len = buf->len - offset; 269 len = buf->len - offset;
270
279 while (len > 0) { 271 while (len > 0) {
280 /* 272 /*
281 * Note: we don't need to check for the err == 1 case here, as 273 * Note: we don't need to check for the err == 1 case here, as
@@ -288,14 +280,13 @@ static int csum_tree_block(struct extent_buffer *buf, u8 *result)
288 if (WARN_ON(err)) 280 if (WARN_ON(err))
289 return err; 281 return err;
290 cur_len = min(len, map_len - (offset - map_start)); 282 cur_len = min(len, map_len - (offset - map_start));
291 crc = btrfs_csum_data(kaddr + offset - map_start, 283 crypto_shash_update(shash, kaddr + offset - map_start, cur_len);
292 crc, cur_len);
293 len -= cur_len; 284 len -= cur_len;
294 offset += cur_len; 285 offset += cur_len;
295 } 286 }
296 memset(result, 0, BTRFS_CSUM_SIZE); 287 memset(result, 0, BTRFS_CSUM_SIZE);
297 288
298 btrfs_csum_final(crc, result); 289 crypto_shash_final(shash, result);
299 290
300 return 0; 291 return 0;
301} 292}
@@ -356,6 +347,16 @@ out:
356 return ret; 347 return ret;
357} 348}
358 349
350static bool btrfs_supported_super_csum(u16 csum_type)
351{
352 switch (csum_type) {
353 case BTRFS_CSUM_TYPE_CRC32:
354 return true;
355 default:
356 return false;
357 }
358}
359
359/* 360/*
360 * Return 0 if the superblock checksum type matches the checksum value of that 361 * Return 0 if the superblock checksum type matches the checksum value of that
361 * algorithm. Pass the raw disk superblock data. 362 * algorithm. Pass the raw disk superblock data.
@@ -365,33 +366,25 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
365{ 366{
366 struct btrfs_super_block *disk_sb = 367 struct btrfs_super_block *disk_sb =
367 (struct btrfs_super_block *)raw_disk_sb; 368 (struct btrfs_super_block *)raw_disk_sb;
368 u16 csum_type = btrfs_super_csum_type(disk_sb); 369 char result[BTRFS_CSUM_SIZE];
369 int ret = 0; 370 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
370 371
371 if (csum_type == BTRFS_CSUM_TYPE_CRC32) { 372 shash->tfm = fs_info->csum_shash;
372 u32 crc = ~(u32)0; 373 crypto_shash_init(shash);
373 char result[sizeof(crc)];
374 374
375 /* 375 /*
376 * The super_block structure does not span the whole 376 * The super_block structure does not span the whole
377 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space 377 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
378 * is filled with zeros and is included in the checksum. 378 * filled with zeros and is included in the checksum.
379 */ 379 */
380 crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, 380 crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
381 crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); 381 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
382 btrfs_csum_final(crc, result); 382 crypto_shash_final(shash, result);
383 383
384 if (memcmp(raw_disk_sb, result, sizeof(result))) 384 if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb)))
385 ret = 1; 385 return 1;
386 }
387 386
388 if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { 387 return 0;
389 btrfs_err(fs_info, "unsupported checksum algorithm %u",
390 csum_type);
391 ret = 1;
392 }
393
394 return ret;
395} 388}
396 389
397int btrfs_verify_level_key(struct extent_buffer *eb, int level, 390int btrfs_verify_level_key(struct extent_buffer *eb, int level,
@@ -873,14 +866,13 @@ static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
873 return btree_csum_one_bio(bio); 866 return btree_csum_one_bio(bio);
874} 867}
875 868
876static int check_async_write(struct btrfs_inode *bi) 869static int check_async_write(struct btrfs_fs_info *fs_info,
870 struct btrfs_inode *bi)
877{ 871{
878 if (atomic_read(&bi->sync_writers)) 872 if (atomic_read(&bi->sync_writers))
879 return 0; 873 return 0;
880#ifdef CONFIG_X86 874 if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
881 if (static_cpu_has(X86_FEATURE_XMM4_2))
882 return 0; 875 return 0;
883#endif
884 return 1; 876 return 1;
885} 877}
886 878
@@ -889,7 +881,7 @@ static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
889 unsigned long bio_flags) 881 unsigned long bio_flags)
890{ 882{
891 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 883 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
892 int async = check_async_write(BTRFS_I(inode)); 884 int async = check_async_write(fs_info, BTRFS_I(inode));
893 blk_status_t ret; 885 blk_status_t ret;
894 886
895 if (bio_op(bio) != REQ_OP_WRITE) { 887 if (bio_op(bio) != REQ_OP_WRITE) {
@@ -2262,6 +2254,29 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
2262 return 0; 2254 return 0;
2263} 2255}
2264 2256
2257static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
2258{
2259 struct crypto_shash *csum_shash;
2260 const char *csum_name = btrfs_super_csum_name(csum_type);
2261
2262 csum_shash = crypto_alloc_shash(csum_name, 0, 0);
2263
2264 if (IS_ERR(csum_shash)) {
2265 btrfs_err(fs_info, "error allocating %s hash for checksum",
2266 csum_name);
2267 return PTR_ERR(csum_shash);
2268 }
2269
2270 fs_info->csum_shash = csum_shash;
2271
2272 return 0;
2273}
2274
2275static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
2276{
2277 crypto_free_shash(fs_info->csum_shash);
2278}
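
The checksum rework in this file funnels everything through the kernel crypto shash API: allocate a tfm once, then init/update/final per checksum. A minimal sketch of that API in isolation (a toy module, not btrfs code; error handling from the update/final calls is trimmed):

#include <linux/module.h>
#include <crypto/hash.h>

static int __init shash_demo_init(void)
{
	struct crypto_shash *tfm;
	u8 digest[4];			/* crc32c digest is 4 bytes */
	static const u8 data[] = "hello";

	tfm = crypto_alloc_shash("crc32c", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(shash, tfm);

		shash->tfm = tfm;
		crypto_shash_init(shash);
		crypto_shash_update(shash, data, sizeof(data) - 1);
		crypto_shash_final(shash, digest);
	}

	pr_info("crc32c(\"hello\") = %*phN\n", 4, digest);
	crypto_free_shash(tfm);
	return 0;
}

static void __exit shash_demo_exit(void) { }

module_init(shash_demo_init);
module_exit(shash_demo_exit);
MODULE_LICENSE("GPL");
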
2279
2265static int btrfs_replay_log(struct btrfs_fs_info *fs_info, 2280static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2266 struct btrfs_fs_devices *fs_devices) 2281 struct btrfs_fs_devices *fs_devices)
2267{ 2282{
@@ -2577,7 +2592,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
2577 ret = validate_super(fs_info, sb, -1); 2592 ret = validate_super(fs_info, sb, -1);
2578 if (ret < 0) 2593 if (ret < 0)
2579 goto out; 2594 goto out;
2580 if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) { 2595 if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
2581 ret = -EUCLEAN; 2596 ret = -EUCLEAN;
2582 btrfs_err(fs_info, "invalid csum type, has %u want %u", 2597 btrfs_err(fs_info, "invalid csum type, has %u want %u",
2583 btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); 2598 btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
@@ -2607,6 +2622,7 @@ int open_ctree(struct super_block *sb,
2607 u32 stripesize; 2622 u32 stripesize;
2608 u64 generation; 2623 u64 generation;
2609 u64 features; 2624 u64 features;
2625 u16 csum_type;
2610 struct btrfs_key location; 2626 struct btrfs_key location;
2611 struct buffer_head *bh; 2627 struct buffer_head *bh;
2612 struct btrfs_super_block *disk_super; 2628 struct btrfs_super_block *disk_super;
@@ -2689,7 +2705,7 @@ int open_ctree(struct super_block *sb,
2689 INIT_LIST_HEAD(&fs_info->space_info); 2705 INIT_LIST_HEAD(&fs_info->space_info);
2690 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); 2706 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2691 INIT_LIST_HEAD(&fs_info->unused_bgs); 2707 INIT_LIST_HEAD(&fs_info->unused_bgs);
2692 btrfs_mapping_init(&fs_info->mapping_tree); 2708 extent_map_tree_init(&fs_info->mapping_tree);
2693 btrfs_init_block_rsv(&fs_info->global_block_rsv, 2709 btrfs_init_block_rsv(&fs_info->global_block_rsv,
2694 BTRFS_BLOCK_RSV_GLOBAL); 2710 BTRFS_BLOCK_RSV_GLOBAL);
2695 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); 2711 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
@@ -2793,6 +2809,8 @@ int open_ctree(struct super_block *sb,
2793 spin_lock_init(&fs_info->swapfile_pins_lock); 2809 spin_lock_init(&fs_info->swapfile_pins_lock);
2794 fs_info->swapfile_pins = RB_ROOT; 2810 fs_info->swapfile_pins = RB_ROOT;
2795 2811
2812 fs_info->send_in_progress = 0;
2813
2796 ret = btrfs_alloc_stripe_hash_table(fs_info); 2814 ret = btrfs_alloc_stripe_hash_table(fs_info);
2797 if (ret) { 2815 if (ret) {
2798 err = ret; 2816 err = ret;
@@ -2813,6 +2831,25 @@ int open_ctree(struct super_block *sb,
2813 } 2831 }
2814 2832
2815 /* 2833 /*
 2834 * Verify the type first; if that or the checksum value is
 2835 * corrupted, we'll find out
2836 */
2837 csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data);
2838 if (!btrfs_supported_super_csum(csum_type)) {
2839 btrfs_err(fs_info, "unsupported checksum algorithm: %u",
2840 csum_type);
2841 err = -EINVAL;
2842 brelse(bh);
2843 goto fail_alloc;
2844 }
2845
2846 ret = btrfs_init_csum_hash(fs_info, csum_type);
2847 if (ret) {
2848 err = ret;
2849 goto fail_alloc;
2850 }
2851
2852 /*
2816 * We want to check superblock checksum, the type is stored inside. 2853 * We want to check superblock checksum, the type is stored inside.
2817 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). 2854 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
2818 */ 2855 */
@@ -2820,7 +2857,7 @@ int open_ctree(struct super_block *sb,
2820 btrfs_err(fs_info, "superblock checksum mismatch"); 2857 btrfs_err(fs_info, "superblock checksum mismatch");
2821 err = -EINVAL; 2858 err = -EINVAL;
2822 brelse(bh); 2859 brelse(bh);
2823 goto fail_alloc; 2860 goto fail_csum;
2824 } 2861 }
2825 2862
2826 /* 2863 /*
@@ -2857,11 +2894,11 @@ int open_ctree(struct super_block *sb,
2857 if (ret) { 2894 if (ret) {
2858 btrfs_err(fs_info, "superblock contains fatal errors"); 2895 btrfs_err(fs_info, "superblock contains fatal errors");
2859 err = -EINVAL; 2896 err = -EINVAL;
2860 goto fail_alloc; 2897 goto fail_csum;
2861 } 2898 }
2862 2899
2863 if (!btrfs_super_root(disk_super)) 2900 if (!btrfs_super_root(disk_super))
2864 goto fail_alloc; 2901 goto fail_csum;
2865 2902
2866 /* check FS state, whether FS is broken. */ 2903 /* check FS state, whether FS is broken. */
2867 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) 2904 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
@@ -2883,7 +2920,7 @@ int open_ctree(struct super_block *sb,
2883 ret = btrfs_parse_options(fs_info, options, sb->s_flags); 2920 ret = btrfs_parse_options(fs_info, options, sb->s_flags);
2884 if (ret) { 2921 if (ret) {
2885 err = ret; 2922 err = ret;
2886 goto fail_alloc; 2923 goto fail_csum;
2887 } 2924 }
2888 2925
2889 features = btrfs_super_incompat_flags(disk_super) & 2926 features = btrfs_super_incompat_flags(disk_super) &
@@ -2893,7 +2930,7 @@ int open_ctree(struct super_block *sb,
2893 "cannot mount because of unsupported optional features (%llx)", 2930 "cannot mount because of unsupported optional features (%llx)",
2894 features); 2931 features);
2895 err = -EINVAL; 2932 err = -EINVAL;
2896 goto fail_alloc; 2933 goto fail_csum;
2897 } 2934 }
2898 2935
2899 features = btrfs_super_incompat_flags(disk_super); 2936 features = btrfs_super_incompat_flags(disk_super);
@@ -2937,7 +2974,7 @@ int open_ctree(struct super_block *sb,
2937 btrfs_err(fs_info, 2974 btrfs_err(fs_info,
2938"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", 2975"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
2939 nodesize, sectorsize); 2976 nodesize, sectorsize);
2940 goto fail_alloc; 2977 goto fail_csum;
2941 } 2978 }
2942 2979
2943 /* 2980 /*
@@ -2953,7 +2990,7 @@ int open_ctree(struct super_block *sb,
2953 "cannot mount read-write because of unsupported optional features (%llx)", 2990 "cannot mount read-write because of unsupported optional features (%llx)",
2954 features); 2991 features);
2955 err = -EINVAL; 2992 err = -EINVAL;
2956 goto fail_alloc; 2993 goto fail_csum;
2957 } 2994 }
2958 2995
2959 ret = btrfs_init_workqueues(fs_info, fs_devices); 2996 ret = btrfs_init_workqueues(fs_info, fs_devices);
@@ -3331,6 +3368,8 @@ fail_tree_roots:
3331fail_sb_buffer: 3368fail_sb_buffer:
3332 btrfs_stop_all_workers(fs_info); 3369 btrfs_stop_all_workers(fs_info);
3333 btrfs_free_block_groups(fs_info); 3370 btrfs_free_block_groups(fs_info);
3371fail_csum:
3372 btrfs_free_csum_hash(fs_info);
3334fail_alloc: 3373fail_alloc:
3335fail_iput: 3374fail_iput:
3336 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3375 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -3472,17 +3511,20 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
3472static int write_dev_supers(struct btrfs_device *device, 3511static int write_dev_supers(struct btrfs_device *device,
3473 struct btrfs_super_block *sb, int max_mirrors) 3512 struct btrfs_super_block *sb, int max_mirrors)
3474{ 3513{
3514 struct btrfs_fs_info *fs_info = device->fs_info;
3515 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3475 struct buffer_head *bh; 3516 struct buffer_head *bh;
3476 int i; 3517 int i;
3477 int ret; 3518 int ret;
3478 int errors = 0; 3519 int errors = 0;
3479 u32 crc;
3480 u64 bytenr; 3520 u64 bytenr;
3481 int op_flags; 3521 int op_flags;
3482 3522
3483 if (max_mirrors == 0) 3523 if (max_mirrors == 0)
3484 max_mirrors = BTRFS_SUPER_MIRROR_MAX; 3524 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3485 3525
3526 shash->tfm = fs_info->csum_shash;
3527
3486 for (i = 0; i < max_mirrors; i++) { 3528 for (i = 0; i < max_mirrors; i++) {
3487 bytenr = btrfs_sb_offset(i); 3529 bytenr = btrfs_sb_offset(i);
3488 if (bytenr + BTRFS_SUPER_INFO_SIZE >= 3530 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
@@ -3491,10 +3533,10 @@ static int write_dev_supers(struct btrfs_device *device,
3491 3533
3492 btrfs_set_super_bytenr(sb, bytenr); 3534 btrfs_set_super_bytenr(sb, bytenr);
3493 3535
3494 crc = ~(u32)0; 3536 crypto_shash_init(shash);
3495 crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc, 3537 crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE,
3496 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); 3538 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
3497 btrfs_csum_final(crc, sb->csum); 3539 crypto_shash_final(shash, sb->csum);
3498 3540
3499 /* One reference for us, and we leave it for the caller */ 3541 /* One reference for us, and we leave it for the caller */
3500 bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, 3542 bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
@@ -3709,7 +3751,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
3709 3751
3710 if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || 3752 if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
3711 (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) 3753 (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
3712 min_tolerated = min(min_tolerated, 3754 min_tolerated = min_t(int, min_tolerated,
3713 btrfs_raid_array[BTRFS_RAID_SINGLE]. 3755 btrfs_raid_array[BTRFS_RAID_SINGLE].
3714 tolerated_failures); 3756 tolerated_failures);
3715 3757
@@ -3718,7 +3760,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
3718 continue; 3760 continue;
3719 if (!(flags & btrfs_raid_array[raid_type].bg_flag)) 3761 if (!(flags & btrfs_raid_array[raid_type].bg_flag))
3720 continue; 3762 continue;
3721 min_tolerated = min(min_tolerated, 3763 min_tolerated = min_t(int, min_tolerated,
3722 btrfs_raid_array[raid_type]. 3764 btrfs_raid_array[raid_type].
3723 tolerated_failures); 3765 tolerated_failures);
3724 } 3766 }
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index a0161aa1ea0b..e80f7c45a307 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -115,8 +115,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
115 int atomic); 115 int atomic);
116int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, 116int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
117 struct btrfs_key *first_key); 117 struct btrfs_key *first_key);
118u32 btrfs_csum_data(const char *data, u32 seed, size_t len);
119void btrfs_csum_final(u32 crc, u8 *result);
120blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 118blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
121 enum btrfs_wq_endio_type metadata); 119 enum btrfs_wq_endio_type metadata);
122blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 120blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5faf057f6f37..d3b58e388535 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -28,46 +28,12 @@
28#include "sysfs.h" 28#include "sysfs.h"
29#include "qgroup.h" 29#include "qgroup.h"
30#include "ref-verify.h" 30#include "ref-verify.h"
31#include "space-info.h"
32#include "block-rsv.h"
33#include "delalloc-space.h"
31 34
32#undef SCRAMBLE_DELAYED_REFS 35#undef SCRAMBLE_DELAYED_REFS
33 36
34/*
35 * control flags for do_chunk_alloc's force field
36 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
37 * if we really need one.
38 *
39 * CHUNK_ALLOC_LIMITED means to only try and allocate one
40 * if we have very few chunks already allocated. This is
41 * used as part of the clustering code to help make sure
42 * we have a good pool of storage to cluster in, without
43 * filling the FS with empty chunks
44 *
45 * CHUNK_ALLOC_FORCE means it must try to allocate one
46 *
47 */
48enum {
49 CHUNK_ALLOC_NO_FORCE = 0,
50 CHUNK_ALLOC_LIMITED = 1,
51 CHUNK_ALLOC_FORCE = 2,
52};
53
54/*
55 * Declare a helper function to detect underflow of various space info members
56 */
57#define DECLARE_SPACE_INFO_UPDATE(name) \
58static inline void update_##name(struct btrfs_space_info *sinfo, \
59 s64 bytes) \
60{ \
61 if (bytes < 0 && sinfo->name < -bytes) { \
62 WARN_ON(1); \
63 sinfo->name = 0; \
64 return; \
65 } \
66 sinfo->name += bytes; \
67}
68
69DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
70DECLARE_SPACE_INFO_UPDATE(bytes_pinned);
71 37
72static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 38static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
73 struct btrfs_delayed_ref_node *node, u64 parent, 39 struct btrfs_delayed_ref_node *node, u64 parent,
@@ -84,21 +50,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
84static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 50static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
85 struct btrfs_delayed_ref_node *node, 51 struct btrfs_delayed_ref_node *node,
86 struct btrfs_delayed_extent_op *extent_op); 52 struct btrfs_delayed_extent_op *extent_op);
87static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
88 int force);
89static int find_next_key(struct btrfs_path *path, int level, 53static int find_next_key(struct btrfs_path *path, int level,
90 struct btrfs_key *key); 54 struct btrfs_key *key);
91static void dump_space_info(struct btrfs_fs_info *fs_info,
92 struct btrfs_space_info *info, u64 bytes,
93 int dump_block_groups);
94static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
95 u64 num_bytes);
96static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
97 struct btrfs_space_info *space_info,
98 u64 num_bytes);
99static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
100 struct btrfs_space_info *space_info,
101 u64 num_bytes);
102 55
103static noinline int 56static noinline int
104block_group_cache_done(struct btrfs_block_group_cache *cache) 57block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -737,62 +690,39 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
737 return block_group_cache_tree_search(info, bytenr, 1); 690 return block_group_cache_tree_search(info, bytenr, 1);
738} 691}
739 692
740static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 693static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
741 u64 flags)
742{ 694{
743 struct list_head *head = &info->space_info; 695 if (ref->type == BTRFS_REF_METADATA) {
744 struct btrfs_space_info *found; 696 if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
745 697 return BTRFS_BLOCK_GROUP_SYSTEM;
746 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 698 else
747 699 return BTRFS_BLOCK_GROUP_METADATA;
748 rcu_read_lock();
749 list_for_each_entry_rcu(found, head, list) {
750 if (found->flags & flags) {
751 rcu_read_unlock();
752 return found;
753 }
754 } 700 }
755 rcu_read_unlock(); 701 return BTRFS_BLOCK_GROUP_DATA;
756 return NULL;
757} 702}
758 703
759static void add_pinned_bytes(struct btrfs_fs_info *fs_info, 704static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
760 struct btrfs_ref *ref, int sign) 705 struct btrfs_ref *ref)
761{ 706{
762 struct btrfs_space_info *space_info; 707 struct btrfs_space_info *space_info;
763 s64 num_bytes; 708 u64 flags = generic_ref_to_space_flags(ref);
764 u64 flags;
765
766 ASSERT(sign == 1 || sign == -1);
767 num_bytes = sign * ref->len;
768 if (ref->type == BTRFS_REF_METADATA) {
769 if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
770 flags = BTRFS_BLOCK_GROUP_SYSTEM;
771 else
772 flags = BTRFS_BLOCK_GROUP_METADATA;
773 } else {
774 flags = BTRFS_BLOCK_GROUP_DATA;
775 }
776 709
777 space_info = __find_space_info(fs_info, flags); 710 space_info = btrfs_find_space_info(fs_info, flags);
778 ASSERT(space_info); 711 ASSERT(space_info);
779 percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes, 712 percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
780 BTRFS_TOTAL_BYTES_PINNED_BATCH); 713 BTRFS_TOTAL_BYTES_PINNED_BATCH);
781} 714}
782 715
783/* 716static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
784 * after adding space to the filesystem, we need to clear the full flags 717 struct btrfs_ref *ref)
785 * on all the space infos.
786 */
787void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
788{ 718{
789 struct list_head *head = &info->space_info; 719 struct btrfs_space_info *space_info;
790 struct btrfs_space_info *found; 720 u64 flags = generic_ref_to_space_flags(ref);
791 721
792 rcu_read_lock(); 722 space_info = btrfs_find_space_info(fs_info, flags);
793 list_for_each_entry_rcu(found, head, list) 723 ASSERT(space_info);
794 found->full = 0; 724 percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
795 rcu_read_unlock(); 725 BTRFS_TOTAL_BYTES_PINNED_BATCH);
796} 726}
797 727
798/* simple helper to search for an existing data extent at a given offset */ 728/* simple helper to search for an existing data extent at a given offset */
@@ -1121,11 +1051,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1121 __le64 lenum; 1051 __le64 lenum;
1122 1052
1123 lenum = cpu_to_le64(root_objectid); 1053 lenum = cpu_to_le64(root_objectid);
1124 high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); 1054 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1125 lenum = cpu_to_le64(owner); 1055 lenum = cpu_to_le64(owner);
1126 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 1056 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1127 lenum = cpu_to_le64(offset); 1057 lenum = cpu_to_le64(offset);
1128 low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); 1058 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1129 1059
1130 return ((u64)high_crc << 31) ^ (u64)low_crc; 1060 return ((u64)high_crc << 31) ^ (u64)low_crc;
1131} 1061}
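The switch to the btrfs_crc32c() wrapper does not change the resulting hash. A self-contained sketch of the same mixing scheme, using a generic crc32c() routine as a stand-in for the kernel helper, assuming the usual ~0 seed values (the seeds sit outside the hunk shown above) and ignoring the little-endian conversion that only matters on big-endian hosts:

	#include <stddef.h>
	#include <stdint.h>

	/* Assume crc32c(seed, buf, len) is any standard CRC32C implementation. */
	extern uint32_t crc32c(uint32_t seed, const void *buf, size_t len);

	static uint64_t hash_extent_data_ref(uint64_t root_objectid, uint64_t owner,
					     uint64_t offset)
	{
		uint32_t high_crc = ~(uint32_t)0;
		uint32_t low_crc = ~(uint32_t)0;
		uint64_t lenum;

		lenum = root_objectid;			/* cpu_to_le64() in the kernel code */
		high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
		lenum = owner;
		low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
		lenum = offset;
		low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

		/* Root goes into the high half, owner/offset into the low half. */
		return ((uint64_t)high_crc << 31) ^ (uint64_t)low_crc;
	}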
@@ -2065,7 +1995,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2065 btrfs_ref_tree_mod(fs_info, generic_ref); 1995 btrfs_ref_tree_mod(fs_info, generic_ref);
2066 1996
2067 if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) 1997 if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
2068 add_pinned_bytes(fs_info, generic_ref, -1); 1998 sub_pinned_bytes(fs_info, generic_ref);
2069 1999
2070 return ret; 2000 return ret;
2071} 2001}
@@ -2462,7 +2392,7 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
2462 flags = BTRFS_BLOCK_GROUP_SYSTEM; 2392 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2463 else 2393 else
2464 flags = BTRFS_BLOCK_GROUP_METADATA; 2394 flags = BTRFS_BLOCK_GROUP_METADATA;
2465 space_info = __find_space_info(fs_info, flags); 2395 space_info = btrfs_find_space_info(fs_info, flags);
2466 ASSERT(space_info); 2396 ASSERT(space_info);
2467 percpu_counter_add_batch(&space_info->total_bytes_pinned, 2397 percpu_counter_add_batch(&space_info->total_bytes_pinned,
2468 -head->num_bytes, 2398 -head->num_bytes,
@@ -2824,49 +2754,6 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2824 return num_csums; 2754 return num_csums;
2825} 2755}
2826 2756
2827bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
2828{
2829 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
2830 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2831 bool ret = false;
2832 u64 reserved;
2833
2834 spin_lock(&global_rsv->lock);
2835 reserved = global_rsv->reserved;
2836 spin_unlock(&global_rsv->lock);
2837
2838 /*
2839 * Since the global reserve is just kind of magic we don't really want
2840 * to rely on it to save our bacon, so if our size is more than the
2841 * delayed_refs_rsv and the global rsv then it's time to think about
2842 * bailing.
2843 */
2844 spin_lock(&delayed_refs_rsv->lock);
2845 reserved += delayed_refs_rsv->reserved;
2846 if (delayed_refs_rsv->size >= reserved)
2847 ret = true;
2848 spin_unlock(&delayed_refs_rsv->lock);
2849 return ret;
2850}
2851
2852int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
2853{
2854 u64 num_entries =
2855 atomic_read(&trans->transaction->delayed_refs.num_entries);
2856 u64 avg_runtime;
2857 u64 val;
2858
2859 smp_mb();
2860 avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
2861 val = num_entries * avg_runtime;
2862 if (val >= NSEC_PER_SEC)
2863 return 1;
2864 if (val >= NSEC_PER_SEC / 2)
2865 return 2;
2866
2867 return btrfs_check_space_for_delayed_refs(trans->fs_info);
2868}
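To make the throttling thresholds above concrete: with, for example, an average delayed-ref run time of 100,000 ns and 12,000 queued entries, val = 12,000 * 100,000 ns = 1.2e9 ns >= NSEC_PER_SEC, so the caller is told to throttle hard (return 1); with 6,000 entries val = 6e8 ns >= NSEC_PER_SEC / 2 and the milder value 2 is returned; below that the decision falls through to the reservation check in btrfs_check_space_for_delayed_refs() above.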
2869
2870/* 2757/*
2871 * this starts processing the delayed reference count updates and 2758 * this starts processing the delayed reference count updates and
2872 * extent insertions we have queued up so far. count can be 2759 * extent insertions we have queued up so far. count can be
@@ -3834,93 +3721,6 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3834 wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); 3721 wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
3835} 3722}
3836 3723
3837static const char *alloc_name(u64 flags)
3838{
3839 switch (flags) {
3840 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3841 return "mixed";
3842 case BTRFS_BLOCK_GROUP_METADATA:
3843 return "metadata";
3844 case BTRFS_BLOCK_GROUP_DATA:
3845 return "data";
3846 case BTRFS_BLOCK_GROUP_SYSTEM:
3847 return "system";
3848 default:
3849 WARN_ON(1);
3850 return "invalid-combination";
3851 };
3852}
3853
3854static int create_space_info(struct btrfs_fs_info *info, u64 flags)
3855{
3856
3857 struct btrfs_space_info *space_info;
3858 int i;
3859 int ret;
3860
3861 space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
3862 if (!space_info)
3863 return -ENOMEM;
3864
3865 ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
3866 GFP_KERNEL);
3867 if (ret) {
3868 kfree(space_info);
3869 return ret;
3870 }
3871
3872 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3873 INIT_LIST_HEAD(&space_info->block_groups[i]);
3874 init_rwsem(&space_info->groups_sem);
3875 spin_lock_init(&space_info->lock);
3876 space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3877 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3878 init_waitqueue_head(&space_info->wait);
3879 INIT_LIST_HEAD(&space_info->ro_bgs);
3880 INIT_LIST_HEAD(&space_info->tickets);
3881 INIT_LIST_HEAD(&space_info->priority_tickets);
3882
3883 ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
3884 info->space_info_kobj, "%s",
3885 alloc_name(space_info->flags));
3886 if (ret) {
3887 kobject_put(&space_info->kobj);
3888 return ret;
3889 }
3890
3891 list_add_rcu(&space_info->list, &info->space_info);
3892 if (flags & BTRFS_BLOCK_GROUP_DATA)
3893 info->data_sinfo = space_info;
3894
3895 return ret;
3896}
3897
3898static void update_space_info(struct btrfs_fs_info *info, u64 flags,
3899 u64 total_bytes, u64 bytes_used,
3900 u64 bytes_readonly,
3901 struct btrfs_space_info **space_info)
3902{
3903 struct btrfs_space_info *found;
3904 int factor;
3905
3906 factor = btrfs_bg_type_to_factor(flags);
3907
3908 found = __find_space_info(info, flags);
3909 ASSERT(found);
3910 spin_lock(&found->lock);
3911 found->total_bytes += total_bytes;
3912 found->disk_total += total_bytes * factor;
3913 found->bytes_used += bytes_used;
3914 found->disk_used += bytes_used * factor;
3915 found->bytes_readonly += bytes_readonly;
3916 if (total_bytes > 0)
3917 found->full = 0;
3918 space_info_add_new_bytes(info, found, total_bytes -
3919 bytes_used - bytes_readonly);
3920 spin_unlock(&found->lock);
3921 *space_info = found;
3922}
3923
3924static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3724static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3925{ 3725{
3926 u64 extra_flags = chunk_to_extended(flags) & 3726 u64 extra_flags = chunk_to_extended(flags) &
@@ -4068,215 +3868,6 @@ u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
4068 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3868 return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4069} 3869}
4070 3870
4071static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
4072 bool may_use_included)
4073{
4074 ASSERT(s_info);
4075 return s_info->bytes_used + s_info->bytes_reserved +
4076 s_info->bytes_pinned + s_info->bytes_readonly +
4077 (may_use_included ? s_info->bytes_may_use : 0);
4078}
4079
4080int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
4081{
4082 struct btrfs_root *root = inode->root;
4083 struct btrfs_fs_info *fs_info = root->fs_info;
4084 struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
4085 u64 used;
4086 int ret = 0;
4087 int need_commit = 2;
4088 int have_pinned_space;
4089
4090 /* make sure bytes are sectorsize aligned */
4091 bytes = ALIGN(bytes, fs_info->sectorsize);
4092
4093 if (btrfs_is_free_space_inode(inode)) {
4094 need_commit = 0;
4095 ASSERT(current->journal_info);
4096 }
4097
4098again:
4099 /* make sure we have enough space to handle the data first */
4100 spin_lock(&data_sinfo->lock);
4101 used = btrfs_space_info_used(data_sinfo, true);
4102
4103 if (used + bytes > data_sinfo->total_bytes) {
4104 struct btrfs_trans_handle *trans;
4105
4106 /*
4107 * if we don't have enough free bytes in this space then we need
4108 * to alloc a new chunk.
4109 */
4110 if (!data_sinfo->full) {
4111 u64 alloc_target;
4112
4113 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4114 spin_unlock(&data_sinfo->lock);
4115
4116 alloc_target = btrfs_data_alloc_profile(fs_info);
4117 /*
4118 * It is ugly that we don't call nolock join
4119 * transaction for the free space inode case here.
4120 * But it is safe because we only do the data space
4121 * reservation for the free space cache in the
 4122 * transaction context; the common join transaction
 4123 * just increases the counter of the current transaction
 4124 * handle and doesn't try to acquire the trans_lock of
4125 * the fs.
4126 */
4127 trans = btrfs_join_transaction(root);
4128 if (IS_ERR(trans))
4129 return PTR_ERR(trans);
4130
4131 ret = do_chunk_alloc(trans, alloc_target,
4132 CHUNK_ALLOC_NO_FORCE);
4133 btrfs_end_transaction(trans);
4134 if (ret < 0) {
4135 if (ret != -ENOSPC)
4136 return ret;
4137 else {
4138 have_pinned_space = 1;
4139 goto commit_trans;
4140 }
4141 }
4142
4143 goto again;
4144 }
4145
4146 /*
4147 * If we don't have enough pinned space to deal with this
 4148 * allocation, and no chunk was removed in the current transaction,
4149 * don't bother committing the transaction.
4150 */
4151 have_pinned_space = __percpu_counter_compare(
4152 &data_sinfo->total_bytes_pinned,
4153 used + bytes - data_sinfo->total_bytes,
4154 BTRFS_TOTAL_BYTES_PINNED_BATCH);
4155 spin_unlock(&data_sinfo->lock);
4156
4157 /* commit the current transaction and try again */
4158commit_trans:
4159 if (need_commit) {
4160 need_commit--;
4161
4162 if (need_commit > 0) {
4163 btrfs_start_delalloc_roots(fs_info, -1);
4164 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4165 (u64)-1);
4166 }
4167
4168 trans = btrfs_join_transaction(root);
4169 if (IS_ERR(trans))
4170 return PTR_ERR(trans);
4171 if (have_pinned_space >= 0 ||
4172 test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4173 &trans->transaction->flags) ||
4174 need_commit > 0) {
4175 ret = btrfs_commit_transaction(trans);
4176 if (ret)
4177 return ret;
4178 /*
4179 * The cleaner kthread might still be doing iput
4180 * operations. Wait for it to finish so that
4181 * more space is released. We don't need to
4182 * explicitly run the delayed iputs here because
4183 * the commit_transaction would have woken up
4184 * the cleaner.
4185 */
4186 ret = btrfs_wait_on_delayed_iputs(fs_info);
4187 if (ret)
4188 return ret;
4189 goto again;
4190 } else {
4191 btrfs_end_transaction(trans);
4192 }
4193 }
4194
4195 trace_btrfs_space_reservation(fs_info,
4196 "space_info:enospc",
4197 data_sinfo->flags, bytes, 1);
4198 return -ENOSPC;
4199 }
4200 update_bytes_may_use(data_sinfo, bytes);
4201 trace_btrfs_space_reservation(fs_info, "space_info",
4202 data_sinfo->flags, bytes, 1);
4203 spin_unlock(&data_sinfo->lock);
4204
4205 return 0;
4206}
4207
4208int btrfs_check_data_free_space(struct inode *inode,
4209 struct extent_changeset **reserved, u64 start, u64 len)
4210{
4211 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4212 int ret;
4213
4214 /* align the range */
4215 len = round_up(start + len, fs_info->sectorsize) -
4216 round_down(start, fs_info->sectorsize);
4217 start = round_down(start, fs_info->sectorsize);
4218
4219 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4220 if (ret < 0)
4221 return ret;
4222
4223 /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4224 ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4225 if (ret < 0)
4226 btrfs_free_reserved_data_space_noquota(inode, start, len);
4227 else
4228 ret = 0;
4229 return ret;
4230}
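A small self-contained illustration of the alignment above, assuming a 4 KiB sectorsize: a 100-byte reservation starting at offset 1000 is widened to cover every sector it touches.

	#include <assert.h>
	#include <stdint.h>

	#define SECTORSIZE 4096ULL

	static uint64_t rdown(uint64_t x, uint64_t a) { return x - (x % a); }
	static uint64_t rup(uint64_t x, uint64_t a) { return rdown(x + a - 1, a); }

	int main(void)
	{
		uint64_t start = 1000, len = 100;

		/* Same arithmetic as btrfs_check_data_free_space() above. */
		len = rup(start + len, SECTORSIZE) - rdown(start, SECTORSIZE);
		start = rdown(start, SECTORSIZE);

		assert(start == 0 && len == SECTORSIZE);	/* one full sector reserved */
		return 0;
	}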
4231
4232/*
 4233 * Called if we need to clear a data reservation for this inode,
 4234 * normally in an error case.
 4235 *
 4236 * This one will *NOT* use the accurate qgroup reserved space API, just for
 4237 * the case where we can't sleep and are sure it won't affect qgroup reserved space.
4238 * Like clear_bit_hook().
4239 */
4240void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4241 u64 len)
4242{
4243 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4244 struct btrfs_space_info *data_sinfo;
4245
4246 /* Make sure the range is aligned to sectorsize */
4247 len = round_up(start + len, fs_info->sectorsize) -
4248 round_down(start, fs_info->sectorsize);
4249 start = round_down(start, fs_info->sectorsize);
4250
4251 data_sinfo = fs_info->data_sinfo;
4252 spin_lock(&data_sinfo->lock);
4253 update_bytes_may_use(data_sinfo, -len);
4254 trace_btrfs_space_reservation(fs_info, "space_info",
4255 data_sinfo->flags, len, 0);
4256 spin_unlock(&data_sinfo->lock);
4257}
4258
4259/*
 4260 * Called if we need to clear a data reservation for this inode,
 4261 * normally in an error case.
4262 *
4263 * This one will handle the per-inode data rsv map for accurate reserved
4264 * space framework.
4265 */
4266void btrfs_free_reserved_data_space(struct inode *inode,
4267 struct extent_changeset *reserved, u64 start, u64 len)
4268{
4269 struct btrfs_root *root = BTRFS_I(inode)->root;
4270
4271 /* Make sure the range is aligned to sectorsize */
4272 len = round_up(start + len, root->fs_info->sectorsize) -
4273 round_down(start, root->fs_info->sectorsize);
4274 start = round_down(start, root->fs_info->sectorsize);
4275
4276 btrfs_free_reserved_data_space_noquota(inode, start, len);
4277 btrfs_qgroup_free_data(inode, reserved, start, len);
4278}
4279
4280static void force_metadata_allocation(struct btrfs_fs_info *info) 3871static void force_metadata_allocation(struct btrfs_fs_info *info)
4281{ 3872{
4282 struct list_head *head = &info->space_info; 3873 struct list_head *head = &info->space_info;
@@ -4290,11 +3881,6 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
4290 rcu_read_unlock(); 3881 rcu_read_unlock();
4291} 3882}
4292 3883
4293static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4294{
4295 return (global->size << 1);
4296}
4297
4298static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 3884static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4299 struct btrfs_space_info *sinfo, int force) 3885 struct btrfs_space_info *sinfo, int force)
4300{ 3886{
@@ -4325,15 +3911,9 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4325{ 3911{
4326 u64 num_dev; 3912 u64 num_dev;
4327 3913
4328 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 3914 num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
4329 BTRFS_BLOCK_GROUP_RAID0 | 3915 if (!num_dev)
4330 BTRFS_BLOCK_GROUP_RAID5 |
4331 BTRFS_BLOCK_GROUP_RAID6))
4332 num_dev = fs_info->fs_devices->rw_devices; 3916 num_dev = fs_info->fs_devices->rw_devices;
4333 else if (type & BTRFS_BLOCK_GROUP_RAID1)
4334 num_dev = 2;
4335 else
4336 num_dev = 1; /* DUP or single */
4337 3917
4338 return num_dev; 3918 return num_dev;
4339} 3919}
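The table lookup that replaces the open-coded branches above relies on devs_max in btrfs_raid_array: assuming the usual table contents, profiles with a fixed device count (RAID1: 2, DUP and SINGLE: 1) report it directly, while the striped profiles (RAID0/10/5/6) report devs_max == 0 and fall back to fs_devices->rw_devices, e.g. 4 on a four-device filesystem, which matches what the deleted if/else chain computed.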
@@ -4358,7 +3938,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4358 */ 3938 */
4359 lockdep_assert_held(&fs_info->chunk_mutex); 3939 lockdep_assert_held(&fs_info->chunk_mutex);
4360 3940
4361 info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3941 info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4362 spin_lock(&info->lock); 3942 spin_lock(&info->lock);
4363 left = info->total_bytes - btrfs_space_info_used(info, true); 3943 left = info->total_bytes - btrfs_space_info_used(info, true);
4364 spin_unlock(&info->lock); 3944 spin_unlock(&info->lock);
@@ -4372,7 +3952,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4372 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 3952 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4373 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 3953 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4374 left, thresh, type); 3954 left, thresh, type);
4375 dump_space_info(fs_info, info, 0, 0); 3955 btrfs_dump_space_info(fs_info, info, 0, 0);
4376 } 3956 }
4377 3957
4378 if (left < thresh) { 3958 if (left < thresh) {
@@ -4405,8 +3985,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4405 * - return 1 if it successfully allocates a chunk, 3985 * - return 1 if it successfully allocates a chunk,
4406 * - return errors including -ENOSPC otherwise. 3986 * - return errors including -ENOSPC otherwise.
4407 */ 3987 */
4408static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, 3988int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4409 int force) 3989 enum btrfs_chunk_alloc_enum force)
4410{ 3990{
4411 struct btrfs_fs_info *fs_info = trans->fs_info; 3991 struct btrfs_fs_info *fs_info = trans->fs_info;
4412 struct btrfs_space_info *space_info; 3992 struct btrfs_space_info *space_info;
@@ -4418,7 +3998,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4418 if (trans->allocating_chunk) 3998 if (trans->allocating_chunk)
4419 return -ENOSPC; 3999 return -ENOSPC;
4420 4000
4421 space_info = __find_space_info(fs_info, flags); 4001 space_info = btrfs_find_space_info(fs_info, flags);
4422 ASSERT(space_info); 4002 ASSERT(space_info);
4423 4003
4424 do { 4004 do {
@@ -4525,1714 +4105,6 @@ out:
4525 return ret; 4105 return ret;
4526} 4106}
4527 4107
4528static int can_overcommit(struct btrfs_fs_info *fs_info,
4529 struct btrfs_space_info *space_info, u64 bytes,
4530 enum btrfs_reserve_flush_enum flush,
4531 bool system_chunk)
4532{
4533 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4534 u64 profile;
4535 u64 space_size;
4536 u64 avail;
4537 u64 used;
4538 int factor;
4539
4540 /* Don't overcommit when in mixed mode. */
4541 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4542 return 0;
4543
4544 if (system_chunk)
4545 profile = btrfs_system_alloc_profile(fs_info);
4546 else
4547 profile = btrfs_metadata_alloc_profile(fs_info);
4548
4549 used = btrfs_space_info_used(space_info, false);
4550
4551 /*
4552 * We only want to allow over committing if we have lots of actual space
4553 * free, but if we don't have enough space to handle the global reserve
4554 * space then we could end up having a real enospc problem when trying
4555 * to allocate a chunk or some other such important allocation.
4556 */
4557 spin_lock(&global_rsv->lock);
4558 space_size = calc_global_rsv_need_space(global_rsv);
4559 spin_unlock(&global_rsv->lock);
4560 if (used + space_size >= space_info->total_bytes)
4561 return 0;
4562
4563 used += space_info->bytes_may_use;
4564
4565 avail = atomic64_read(&fs_info->free_chunk_space);
4566
4567 /*
4568 * If we have dup, raid1 or raid10 then only half of the free
4569 * space is actually usable. For raid56, the space info used
4570 * doesn't include the parity drive, so we don't have to
4571 * change the math
4572 */
4573 factor = btrfs_bg_type_to_factor(profile);
4574 avail = div_u64(avail, factor);
4575
4576 /*
4577 * If we aren't flushing all things, let us overcommit up to
4578 * 1/2th of the space. If we can flush, don't let us overcommit
 4579 * 1/2 of the space. If we can flush, don't let us overcommit
 4580 * too much; let it overcommit up to 1/8 of the space.
4581 if (flush == BTRFS_RESERVE_FLUSH_ALL)
4582 avail >>= 3;
4583 else
4584 avail >>= 1;
4585
4586 if (used + bytes < space_info->total_bytes + avail)
4587 return 1;
4588 return 0;
4589}
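A worked example of the tail of this check: with 8 GiB of unallocated device space (fs_info->free_chunk_space) and a RAID1 metadata profile (factor 2), avail = 8 GiB / 2 = 4 GiB; under BTRFS_RESERVE_FLUSH_ALL the allowance is avail >> 3 = 512 MiB, while the other flush modes allow avail >> 1 = 2 GiB, so the reservation succeeds as long as used + bytes stays below total_bytes plus that allowance.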
4590
4591static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4592 unsigned long nr_pages, int nr_items)
4593{
4594 struct super_block *sb = fs_info->sb;
4595
4596 if (down_read_trylock(&sb->s_umount)) {
4597 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4598 up_read(&sb->s_umount);
4599 } else {
4600 /*
 4601 * We needn't worry about the filesystem going from r/w to r/o even
 4602 * though we don't acquire the ->s_umount mutex, because the filesystem
 4603 * should guarantee that the delalloc inode list is empty after the
 4604 * filesystem becomes read-only (all dirty pages are written to
 4605 * the disk).
4606 */
4607 btrfs_start_delalloc_roots(fs_info, nr_items);
4608 if (!current->journal_info)
4609 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
4610 }
4611}
4612
4613static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4614 u64 to_reclaim)
4615{
4616 u64 bytes;
4617 u64 nr;
4618
4619 bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4620 nr = div64_u64(to_reclaim, bytes);
4621 if (!nr)
4622 nr = 1;
4623 return nr;
4624}
4625
4626#define EXTENT_SIZE_PER_ITEM SZ_256K
4627
4628/*
4629 * shrink metadata reservation for delalloc
4630 */
4631static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4632 u64 orig, bool wait_ordered)
4633{
4634 struct btrfs_space_info *space_info;
4635 struct btrfs_trans_handle *trans;
4636 u64 delalloc_bytes;
4637 u64 dio_bytes;
4638 u64 async_pages;
4639 u64 items;
4640 long time_left;
4641 unsigned long nr_pages;
4642 int loops;
4643
 4644	/* Calculate the number of pages we need to flush for this space reservation */
4645 items = calc_reclaim_items_nr(fs_info, to_reclaim);
4646 to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4647
4648 trans = (struct btrfs_trans_handle *)current->journal_info;
4649 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4650
4651 delalloc_bytes = percpu_counter_sum_positive(
4652 &fs_info->delalloc_bytes);
4653 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
4654 if (delalloc_bytes == 0 && dio_bytes == 0) {
4655 if (trans)
4656 return;
4657 if (wait_ordered)
4658 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4659 return;
4660 }
4661
4662 /*
4663 * If we are doing more ordered than delalloc we need to just wait on
4664 * ordered extents, otherwise we'll waste time trying to flush delalloc
4665 * that likely won't give us the space back we need.
4666 */
4667 if (dio_bytes > delalloc_bytes)
4668 wait_ordered = true;
4669
4670 loops = 0;
4671 while ((delalloc_bytes || dio_bytes) && loops < 3) {
4672 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
4673
4674 /*
4675 * Triggers inode writeback for up to nr_pages. This will invoke
4676 * ->writepages callback and trigger delalloc filling
4677 * (btrfs_run_delalloc_range()).
4678 */
4679 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4680
4681 /*
4682 * We need to wait for the compressed pages to start before
4683 * we continue.
4684 */
4685 async_pages = atomic_read(&fs_info->async_delalloc_pages);
4686 if (!async_pages)
4687 goto skip_async;
4688
4689 /*
4690 * Calculate how many compressed pages we want to be written
 4691 * before we continue. I.e. if there are more async pages than we
 4692 * require, wait_event will wait until nr_pages are written.
4693 */
4694 if (async_pages <= nr_pages)
4695 async_pages = 0;
4696 else
4697 async_pages -= nr_pages;
4698
4699 wait_event(fs_info->async_submit_wait,
4700 atomic_read(&fs_info->async_delalloc_pages) <=
4701 (int)async_pages);
4702skip_async:
4703 spin_lock(&space_info->lock);
4704 if (list_empty(&space_info->tickets) &&
4705 list_empty(&space_info->priority_tickets)) {
4706 spin_unlock(&space_info->lock);
4707 break;
4708 }
4709 spin_unlock(&space_info->lock);
4710
4711 loops++;
4712 if (wait_ordered && !trans) {
4713 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4714 } else {
4715 time_left = schedule_timeout_killable(1);
4716 if (time_left)
4717 break;
4718 }
4719 delalloc_bytes = percpu_counter_sum_positive(
4720 &fs_info->delalloc_bytes);
4721 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
4722 }
4723}
4724
4725struct reserve_ticket {
4726 u64 orig_bytes;
4727 u64 bytes;
4728 int error;
4729 struct list_head list;
4730 wait_queue_head_t wait;
4731};
4732
4733/**
 4734 * may_commit_transaction - possibly commit the transaction if it's ok to
 4735 * @fs_info - the fs_info for our fs
 4736 * @space_info - the space_info we are trying to make room in
 4737 *
4739 * This will check to make sure that committing the transaction will actually
4740 * get us somewhere and then commit the transaction if it does. Otherwise it
4741 * will return -ENOSPC.
4742 */
4743static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4744 struct btrfs_space_info *space_info)
4745{
4746 struct reserve_ticket *ticket = NULL;
4747 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4748 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
4749 struct btrfs_trans_handle *trans;
4750 u64 bytes_needed;
4751 u64 reclaim_bytes = 0;
4752
4753 trans = (struct btrfs_trans_handle *)current->journal_info;
4754 if (trans)
4755 return -EAGAIN;
4756
4757 spin_lock(&space_info->lock);
4758 if (!list_empty(&space_info->priority_tickets))
4759 ticket = list_first_entry(&space_info->priority_tickets,
4760 struct reserve_ticket, list);
4761 else if (!list_empty(&space_info->tickets))
4762 ticket = list_first_entry(&space_info->tickets,
4763 struct reserve_ticket, list);
4764 bytes_needed = (ticket) ? ticket->bytes : 0;
4765 spin_unlock(&space_info->lock);
4766
4767 if (!bytes_needed)
4768 return 0;
4769
4770 trans = btrfs_join_transaction(fs_info->extent_root);
4771 if (IS_ERR(trans))
4772 return PTR_ERR(trans);
4773
4774 /*
4775 * See if there is enough pinned space to make this reservation, or if
4776 * we have block groups that are going to be freed, allowing us to
4777 * possibly do a chunk allocation the next loop through.
4778 */
4779 if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
4780 __percpu_counter_compare(&space_info->total_bytes_pinned,
4781 bytes_needed,
4782 BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
4783 goto commit;
4784
4785 /*
4786 * See if there is some space in the delayed insertion reservation for
4787 * this reservation.
4788 */
4789 if (space_info != delayed_rsv->space_info)
4790 goto enospc;
4791
4792 spin_lock(&delayed_rsv->lock);
4793 reclaim_bytes += delayed_rsv->reserved;
4794 spin_unlock(&delayed_rsv->lock);
4795
4796 spin_lock(&delayed_refs_rsv->lock);
4797 reclaim_bytes += delayed_refs_rsv->reserved;
4798 spin_unlock(&delayed_refs_rsv->lock);
4799 if (reclaim_bytes >= bytes_needed)
4800 goto commit;
4801 bytes_needed -= reclaim_bytes;
4802
4803 if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4804 bytes_needed,
4805 BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
4806 goto enospc;
4807
4808commit:
4809 return btrfs_commit_transaction(trans);
4810enospc:
4811 btrfs_end_transaction(trans);
4812 return -ENOSPC;
4813}
4814
4815/*
4816 * Try to flush some data based on policy set by @state. This is only advisory
4817 * and may fail for various reasons. The caller is supposed to examine the
4818 * state of @space_info to detect the outcome.
4819 */
4820static void flush_space(struct btrfs_fs_info *fs_info,
4821 struct btrfs_space_info *space_info, u64 num_bytes,
4822 int state)
4823{
4824 struct btrfs_root *root = fs_info->extent_root;
4825 struct btrfs_trans_handle *trans;
4826 int nr;
4827 int ret = 0;
4828
4829 switch (state) {
4830 case FLUSH_DELAYED_ITEMS_NR:
4831 case FLUSH_DELAYED_ITEMS:
4832 if (state == FLUSH_DELAYED_ITEMS_NR)
4833 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
4834 else
4835 nr = -1;
4836
4837 trans = btrfs_join_transaction(root);
4838 if (IS_ERR(trans)) {
4839 ret = PTR_ERR(trans);
4840 break;
4841 }
4842 ret = btrfs_run_delayed_items_nr(trans, nr);
4843 btrfs_end_transaction(trans);
4844 break;
4845 case FLUSH_DELALLOC:
4846 case FLUSH_DELALLOC_WAIT:
4847 shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
4848 state == FLUSH_DELALLOC_WAIT);
4849 break;
4850 case FLUSH_DELAYED_REFS_NR:
4851 case FLUSH_DELAYED_REFS:
4852 trans = btrfs_join_transaction(root);
4853 if (IS_ERR(trans)) {
4854 ret = PTR_ERR(trans);
4855 break;
4856 }
4857 if (state == FLUSH_DELAYED_REFS_NR)
4858 nr = calc_reclaim_items_nr(fs_info, num_bytes);
4859 else
4860 nr = 0;
4861 btrfs_run_delayed_refs(trans, nr);
4862 btrfs_end_transaction(trans);
4863 break;
4864 case ALLOC_CHUNK:
4865 case ALLOC_CHUNK_FORCE:
4866 trans = btrfs_join_transaction(root);
4867 if (IS_ERR(trans)) {
4868 ret = PTR_ERR(trans);
4869 break;
4870 }
4871 ret = do_chunk_alloc(trans,
4872 btrfs_metadata_alloc_profile(fs_info),
4873 (state == ALLOC_CHUNK) ?
4874 CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE);
4875 btrfs_end_transaction(trans);
4876 if (ret > 0 || ret == -ENOSPC)
4877 ret = 0;
4878 break;
4879 case COMMIT_TRANS:
4880 /*
4881 * If we have pending delayed iputs then we could free up a
4882 * bunch of pinned space, so make sure we run the iputs before
4883 * we do our pinned bytes check below.
4884 */
4885 btrfs_run_delayed_iputs(fs_info);
4886 btrfs_wait_on_delayed_iputs(fs_info);
4887
4888 ret = may_commit_transaction(fs_info, space_info);
4889 break;
4890 default:
4891 ret = -ENOSPC;
4892 break;
4893 }
4894
4895 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
4896 ret);
4897 return;
4898}
4899
4900static inline u64
4901btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
4902 struct btrfs_space_info *space_info,
4903 bool system_chunk)
4904{
4905 struct reserve_ticket *ticket;
4906 u64 used;
4907 u64 expected;
4908 u64 to_reclaim = 0;
4909
4910 list_for_each_entry(ticket, &space_info->tickets, list)
4911 to_reclaim += ticket->bytes;
4912 list_for_each_entry(ticket, &space_info->priority_tickets, list)
4913 to_reclaim += ticket->bytes;
4914 if (to_reclaim)
4915 return to_reclaim;
4916
4917 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4918 if (can_overcommit(fs_info, space_info, to_reclaim,
4919 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4920 return 0;
4921
4922 used = btrfs_space_info_used(space_info, true);
4923
4924 if (can_overcommit(fs_info, space_info, SZ_1M,
4925 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4926 expected = div_factor_fine(space_info->total_bytes, 95);
4927 else
4928 expected = div_factor_fine(space_info->total_bytes, 90);
4929
4930 if (used > expected)
4931 to_reclaim = used - expected;
4932 else
4933 to_reclaim = 0;
4934 to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4935 space_info->bytes_reserved);
4936 return to_reclaim;
4937}
4938
4939static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
4940 struct btrfs_space_info *space_info,
4941 u64 used, bool system_chunk)
4942{
4943 u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4944
4945 /* If we're just plain full then async reclaim just slows us down. */
4946 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4947 return 0;
4948
4949 if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4950 system_chunk))
4951 return 0;
4952
4953 return (used >= thresh && !btrfs_fs_closing(fs_info) &&
4954 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4955}
4956
4957static bool wake_all_tickets(struct list_head *head)
4958{
4959 struct reserve_ticket *ticket;
4960
4961 while (!list_empty(head)) {
4962 ticket = list_first_entry(head, struct reserve_ticket, list);
4963 list_del_init(&ticket->list);
4964 ticket->error = -ENOSPC;
4965 wake_up(&ticket->wait);
4966 if (ticket->bytes != ticket->orig_bytes)
4967 return true;
4968 }
4969 return false;
4970}
4971
4972/*
 4973 * This is for normal flushers; we can wait all day if we want to. We
4974 * will loop and continuously try to flush as long as we are making progress.
4975 * We count progress as clearing off tickets each time we have to loop.
4976 */
4977static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4978{
4979 struct btrfs_fs_info *fs_info;
4980 struct btrfs_space_info *space_info;
4981 u64 to_reclaim;
4982 int flush_state;
4983 int commit_cycles = 0;
4984 u64 last_tickets_id;
4985
4986 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4987 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4988
4989 spin_lock(&space_info->lock);
4990 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
4991 false);
4992 if (!to_reclaim) {
4993 space_info->flush = 0;
4994 spin_unlock(&space_info->lock);
4995 return;
4996 }
4997 last_tickets_id = space_info->tickets_id;
4998 spin_unlock(&space_info->lock);
4999
5000 flush_state = FLUSH_DELAYED_ITEMS_NR;
5001 do {
5002 flush_space(fs_info, space_info, to_reclaim, flush_state);
5003 spin_lock(&space_info->lock);
5004 if (list_empty(&space_info->tickets)) {
5005 space_info->flush = 0;
5006 spin_unlock(&space_info->lock);
5007 return;
5008 }
5009 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
5010 space_info,
5011 false);
5012 if (last_tickets_id == space_info->tickets_id) {
5013 flush_state++;
5014 } else {
5015 last_tickets_id = space_info->tickets_id;
5016 flush_state = FLUSH_DELAYED_ITEMS_NR;
5017 if (commit_cycles)
5018 commit_cycles--;
5019 }
5020
5021 /*
5022 * We don't want to force a chunk allocation until we've tried
5023 * pretty hard to reclaim space. Think of the case where we
5024 * freed up a bunch of space and so have a lot of pinned space
 5025 * to reclaim. We would rather use that than possibly create an
 5026 * underutilized metadata chunk. So if this is our first run
 5027 * through the flushing state machine, skip ALLOC_CHUNK_FORCE and
5028 * commit the transaction. If nothing has changed the next go
5029 * around then we can force a chunk allocation.
5030 */
5031 if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
5032 flush_state++;
5033
5034 if (flush_state > COMMIT_TRANS) {
5035 commit_cycles++;
5036 if (commit_cycles > 2) {
5037 if (wake_all_tickets(&space_info->tickets)) {
5038 flush_state = FLUSH_DELAYED_ITEMS_NR;
5039 commit_cycles--;
5040 } else {
5041 space_info->flush = 0;
5042 }
5043 } else {
5044 flush_state = FLUSH_DELAYED_ITEMS_NR;
5045 }
5046 }
5047 spin_unlock(&space_info->lock);
5048 } while (flush_state <= COMMIT_TRANS);
5049}
5050
5051void btrfs_init_async_reclaim_work(struct work_struct *work)
5052{
5053 INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5054}
5055
5056static const enum btrfs_flush_state priority_flush_states[] = {
5057 FLUSH_DELAYED_ITEMS_NR,
5058 FLUSH_DELAYED_ITEMS,
5059 ALLOC_CHUNK,
5060};
5061
5062static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5063 struct btrfs_space_info *space_info,
5064 struct reserve_ticket *ticket)
5065{
5066 u64 to_reclaim;
5067 int flush_state;
5068
5069 spin_lock(&space_info->lock);
5070 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5071 false);
5072 if (!to_reclaim) {
5073 spin_unlock(&space_info->lock);
5074 return;
5075 }
5076 spin_unlock(&space_info->lock);
5077
5078 flush_state = 0;
5079 do {
5080 flush_space(fs_info, space_info, to_reclaim,
5081 priority_flush_states[flush_state]);
5082 flush_state++;
5083 spin_lock(&space_info->lock);
5084 if (ticket->bytes == 0) {
5085 spin_unlock(&space_info->lock);
5086 return;
5087 }
5088 spin_unlock(&space_info->lock);
5089 } while (flush_state < ARRAY_SIZE(priority_flush_states));
5090}
5091
5092static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5093 struct btrfs_space_info *space_info,
5094 struct reserve_ticket *ticket)
5095
5096{
5097 DEFINE_WAIT(wait);
5098 u64 reclaim_bytes = 0;
5099 int ret = 0;
5100
5101 spin_lock(&space_info->lock);
5102 while (ticket->bytes > 0 && ticket->error == 0) {
5103 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5104 if (ret) {
5105 ret = -EINTR;
5106 break;
5107 }
5108 spin_unlock(&space_info->lock);
5109
5110 schedule();
5111
5112 finish_wait(&ticket->wait, &wait);
5113 spin_lock(&space_info->lock);
5114 }
5115 if (!ret)
5116 ret = ticket->error;
5117 if (!list_empty(&ticket->list))
5118 list_del_init(&ticket->list);
5119 if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
5120 reclaim_bytes = ticket->orig_bytes - ticket->bytes;
5121 spin_unlock(&space_info->lock);
5122
5123 if (reclaim_bytes)
5124 space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
5125 return ret;
5126}
5127
5128/**
 5129 * __reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5130 * @root - the root we're allocating for
5131 * @space_info - the space info we want to allocate from
5132 * @orig_bytes - the number of bytes we want
5133 * @flush - whether or not we can flush to make our reservation
5134 *
5135 * This will reserve orig_bytes number of bytes from the space info associated
5136 * with the block_rsv. If there is not enough space it will make an attempt to
5137 * flush out space to make room. It will do this by flushing delalloc if
5138 * possible or committing the transaction. If flush is 0 then no attempts to
5139 * regain reservations will be made and this will fail if there is not enough
5140 * space already.
5141 */
5142static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5143 struct btrfs_space_info *space_info,
5144 u64 orig_bytes,
5145 enum btrfs_reserve_flush_enum flush,
5146 bool system_chunk)
5147{
5148 struct reserve_ticket ticket;
5149 u64 used;
5150 u64 reclaim_bytes = 0;
5151 int ret = 0;
5152
5153 ASSERT(orig_bytes);
5154 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5155
5156 spin_lock(&space_info->lock);
5157 ret = -ENOSPC;
5158 used = btrfs_space_info_used(space_info, true);
5159
5160 /*
5161 * If we have enough space then hooray, make our reservation and carry
 5162 * on. If not, see if we can overcommit, and if we can, hooray, carry on.
 5163 * If not, things get more complicated.
5164 */
5165 if (used + orig_bytes <= space_info->total_bytes) {
5166 update_bytes_may_use(space_info, orig_bytes);
5167 trace_btrfs_space_reservation(fs_info, "space_info",
5168 space_info->flags, orig_bytes, 1);
5169 ret = 0;
5170 } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
5171 system_chunk)) {
5172 update_bytes_may_use(space_info, orig_bytes);
5173 trace_btrfs_space_reservation(fs_info, "space_info",
5174 space_info->flags, orig_bytes, 1);
5175 ret = 0;
5176 }
5177
5178 /*
 5179 * If we couldn't make a reservation then set up our reservation ticket
5180 * and kick the async worker if it's not already running.
5181 *
5182 * If we are a priority flusher then we just need to add our ticket to
5183 * the list and we will do our own flushing further down.
5184 */
5185 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5186 ticket.orig_bytes = orig_bytes;
5187 ticket.bytes = orig_bytes;
5188 ticket.error = 0;
5189 init_waitqueue_head(&ticket.wait);
5190 if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5191 list_add_tail(&ticket.list, &space_info->tickets);
5192 if (!space_info->flush) {
5193 space_info->flush = 1;
5194 trace_btrfs_trigger_flush(fs_info,
5195 space_info->flags,
5196 orig_bytes, flush,
5197 "enospc");
5198 queue_work(system_unbound_wq,
5199 &fs_info->async_reclaim_work);
5200 }
5201 } else {
5202 list_add_tail(&ticket.list,
5203 &space_info->priority_tickets);
5204 }
5205 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5206 used += orig_bytes;
5207 /*
5208 * We will do the space reservation dance during log replay,
5209 * which means we won't have fs_info->fs_root set, so don't do
5210 * the async reclaim as we will panic.
5211 */
5212 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
5213 need_do_async_reclaim(fs_info, space_info,
5214 used, system_chunk) &&
5215 !work_busy(&fs_info->async_reclaim_work)) {
5216 trace_btrfs_trigger_flush(fs_info, space_info->flags,
5217 orig_bytes, flush, "preempt");
5218 queue_work(system_unbound_wq,
5219 &fs_info->async_reclaim_work);
5220 }
5221 }
5222 spin_unlock(&space_info->lock);
5223 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5224 return ret;
5225
5226 if (flush == BTRFS_RESERVE_FLUSH_ALL)
5227 return wait_reserve_ticket(fs_info, space_info, &ticket);
5228
5229 ret = 0;
5230 priority_reclaim_metadata_space(fs_info, space_info, &ticket);
5231 spin_lock(&space_info->lock);
5232 if (ticket.bytes) {
5233 if (ticket.bytes < orig_bytes)
5234 reclaim_bytes = orig_bytes - ticket.bytes;
5235 list_del_init(&ticket.list);
5236 ret = -ENOSPC;
5237 }
5238 spin_unlock(&space_info->lock);
5239
5240 if (reclaim_bytes)
5241 space_info_add_old_bytes(fs_info, space_info, reclaim_bytes);
5242 ASSERT(list_empty(&ticket.list));
5243 return ret;
5244}
5245
5246/**
5247 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5248 * @root - the root we're allocating for
5249 * @block_rsv - the block_rsv we're allocating for
5250 * @orig_bytes - the number of bytes we want
5251 * @flush - whether or not we can flush to make our reservation
5252 *
5253 * This will reserve orig_bytes number of bytes from the space info associated
5254 * with the block_rsv. If there is not enough space it will make an attempt to
5255 * flush out space to make room. It will do this by flushing delalloc if
5256 * possible or committing the transaction. If flush is 0 then no attempts to
5257 * regain reservations will be made and this will fail if there is not enough
5258 * space already.
5259 */
5260static int reserve_metadata_bytes(struct btrfs_root *root,
5261 struct btrfs_block_rsv *block_rsv,
5262 u64 orig_bytes,
5263 enum btrfs_reserve_flush_enum flush)
5264{
5265 struct btrfs_fs_info *fs_info = root->fs_info;
5266 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5267 int ret;
5268 bool system_chunk = (root == fs_info->chunk_root);
5269
5270 ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5271 orig_bytes, flush, system_chunk);
5272 if (ret == -ENOSPC &&
5273 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5274 if (block_rsv != global_rsv &&
5275 !block_rsv_use_bytes(global_rsv, orig_bytes))
5276 ret = 0;
5277 }
5278 if (ret == -ENOSPC) {
5279 trace_btrfs_space_reservation(fs_info, "space_info:enospc",
5280 block_rsv->space_info->flags,
5281 orig_bytes, 1);
5282
5283 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
5284 dump_space_info(fs_info, block_rsv->space_info,
5285 orig_bytes, 0);
5286 }
5287 return ret;
5288}
5289
5290static struct btrfs_block_rsv *get_block_rsv(
5291 const struct btrfs_trans_handle *trans,
5292 const struct btrfs_root *root)
5293{
5294 struct btrfs_fs_info *fs_info = root->fs_info;
5295 struct btrfs_block_rsv *block_rsv = NULL;
5296
5297 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5298 (root == fs_info->csum_root && trans->adding_csums) ||
5299 (root == fs_info->uuid_root))
5300 block_rsv = trans->block_rsv;
5301
5302 if (!block_rsv)
5303 block_rsv = root->block_rsv;
5304
5305 if (!block_rsv)
5306 block_rsv = &fs_info->empty_block_rsv;
5307
5308 return block_rsv;
5309}
5310
5311static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5312 u64 num_bytes)
5313{
5314 int ret = -ENOSPC;
5315 spin_lock(&block_rsv->lock);
5316 if (block_rsv->reserved >= num_bytes) {
5317 block_rsv->reserved -= num_bytes;
5318 if (block_rsv->reserved < block_rsv->size)
5319 block_rsv->full = 0;
5320 ret = 0;
5321 }
5322 spin_unlock(&block_rsv->lock);
5323 return ret;
5324}
5325
5326static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5327 u64 num_bytes, bool update_size)
5328{
5329 spin_lock(&block_rsv->lock);
5330 block_rsv->reserved += num_bytes;
5331 if (update_size)
5332 block_rsv->size += num_bytes;
5333 else if (block_rsv->reserved >= block_rsv->size)
5334 block_rsv->full = 1;
5335 spin_unlock(&block_rsv->lock);
5336}
5337
5338int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5339 struct btrfs_block_rsv *dest, u64 num_bytes,
5340 int min_factor)
5341{
5342 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5343 u64 min_bytes;
5344
5345 if (global_rsv->space_info != dest->space_info)
5346 return -ENOSPC;
5347
5348 spin_lock(&global_rsv->lock);
5349 min_bytes = div_factor(global_rsv->size, min_factor);
5350 if (global_rsv->reserved < min_bytes + num_bytes) {
5351 spin_unlock(&global_rsv->lock);
5352 return -ENOSPC;
5353 }
5354 global_rsv->reserved -= num_bytes;
5355 if (global_rsv->reserved < global_rsv->size)
5356 global_rsv->full = 0;
5357 spin_unlock(&global_rsv->lock);
5358
5359 block_rsv_add_bytes(dest, num_bytes, true);
5360 return 0;
5361}
5362
5363/**
5364 * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
5365 * @fs_info - the fs info for our fs.
5366 * @src - the source block rsv to transfer from.
5367 * @num_bytes - the number of bytes to transfer.
5368 *
5369 * This transfers up to the num_bytes amount from the src rsv to the
5370 * delayed_refs_rsv. Any extra bytes are returned to the space info.
5371 */
5372void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
5373 struct btrfs_block_rsv *src,
5374 u64 num_bytes)
5375{
5376 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
5377 u64 to_free = 0;
5378
5379 spin_lock(&src->lock);
5380 src->reserved -= num_bytes;
5381 src->size -= num_bytes;
5382 spin_unlock(&src->lock);
5383
5384 spin_lock(&delayed_refs_rsv->lock);
5385 if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
5386 u64 delta = delayed_refs_rsv->size -
5387 delayed_refs_rsv->reserved;
5388 if (num_bytes > delta) {
5389 to_free = num_bytes - delta;
5390 num_bytes = delta;
5391 }
5392 } else {
5393 to_free = num_bytes;
5394 num_bytes = 0;
5395 }
5396
5397 if (num_bytes)
5398 delayed_refs_rsv->reserved += num_bytes;
5399 if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
5400 delayed_refs_rsv->full = 1;
5401 spin_unlock(&delayed_refs_rsv->lock);
5402
5403 if (num_bytes)
5404 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5405 0, num_bytes, 1);
5406 if (to_free)
5407 space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info,
5408 to_free);
5409}
5410
5411/**
5412 * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
5413 * @fs_info - the fs_info for our fs.
5414 * @flush - control how we can flush for this reservation.
5415 *
 5416 * This will refill the delayed block_rsv up to 1 item's size worth of space and
5417 * will return -ENOSPC if we can't make the reservation.
5418 */
5419int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
5420 enum btrfs_reserve_flush_enum flush)
5421{
5422 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5423 u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
5424 u64 num_bytes = 0;
5425 int ret = -ENOSPC;
5426
5427 spin_lock(&block_rsv->lock);
5428 if (block_rsv->reserved < block_rsv->size) {
5429 num_bytes = block_rsv->size - block_rsv->reserved;
5430 num_bytes = min(num_bytes, limit);
5431 }
5432 spin_unlock(&block_rsv->lock);
5433
5434 if (!num_bytes)
5435 return 0;
5436
5437 ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
5438 num_bytes, flush);
5439 if (ret)
5440 return ret;
5441 block_rsv_add_bytes(block_rsv, num_bytes, 0);
5442 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5443 0, num_bytes, 1);
5444 return 0;
5445}
5446
5447/*
5448 * This is for space we already have accounted in space_info->bytes_may_use, so
5449 * basically when we're returning space from block_rsv's.
5450 */
5451static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5452 struct btrfs_space_info *space_info,
5453 u64 num_bytes)
5454{
5455 struct reserve_ticket *ticket;
5456 struct list_head *head;
5457 u64 used;
5458 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5459 bool check_overcommit = false;
5460
5461 spin_lock(&space_info->lock);
5462 head = &space_info->priority_tickets;
5463
5464 /*
5465 * If we are over our limit then we need to check and see if we can
5466 * overcommit, and if we can't then we just need to free up our space
5467 * and not satisfy any requests.
5468 */
5469 used = btrfs_space_info_used(space_info, true);
5470 if (used - num_bytes >= space_info->total_bytes)
5471 check_overcommit = true;
5472again:
5473 while (!list_empty(head) && num_bytes) {
5474 ticket = list_first_entry(head, struct reserve_ticket,
5475 list);
5476 /*
5477 * We use 0 bytes because this space is already reserved, so
5478 * adding the ticket space would be a double count.
5479 */
5480 if (check_overcommit &&
5481 !can_overcommit(fs_info, space_info, 0, flush, false))
5482 break;
5483 if (num_bytes >= ticket->bytes) {
5484 list_del_init(&ticket->list);
5485 num_bytes -= ticket->bytes;
5486 ticket->bytes = 0;
5487 space_info->tickets_id++;
5488 wake_up(&ticket->wait);
5489 } else {
5490 ticket->bytes -= num_bytes;
5491 num_bytes = 0;
5492 }
5493 }
5494
5495 if (num_bytes && head == &space_info->priority_tickets) {
5496 head = &space_info->tickets;
5497 flush = BTRFS_RESERVE_FLUSH_ALL;
5498 goto again;
5499 }
5500 update_bytes_may_use(space_info, -num_bytes);
5501 trace_btrfs_space_reservation(fs_info, "space_info",
5502 space_info->flags, num_bytes, 0);
5503 spin_unlock(&space_info->lock);
5504}
5505
5506/*
5507 * This is for newly allocated space that isn't accounted in
5508 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
5509 * we use this helper.
5510 */
5511static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5512 struct btrfs_space_info *space_info,
5513 u64 num_bytes)
5514{
5515 struct reserve_ticket *ticket;
5516 struct list_head *head = &space_info->priority_tickets;
5517
5518again:
5519 while (!list_empty(head) && num_bytes) {
5520 ticket = list_first_entry(head, struct reserve_ticket,
5521 list);
5522 if (num_bytes >= ticket->bytes) {
5523 trace_btrfs_space_reservation(fs_info, "space_info",
5524 space_info->flags,
5525 ticket->bytes, 1);
5526 list_del_init(&ticket->list);
5527 num_bytes -= ticket->bytes;
5528 update_bytes_may_use(space_info, ticket->bytes);
5529 ticket->bytes = 0;
5530 space_info->tickets_id++;
5531 wake_up(&ticket->wait);
5532 } else {
5533 trace_btrfs_space_reservation(fs_info, "space_info",
5534 space_info->flags,
5535 num_bytes, 1);
5536 update_bytes_may_use(space_info, num_bytes);
5537 ticket->bytes -= num_bytes;
5538 num_bytes = 0;
5539 }
5540 }
5541
5542 if (num_bytes && head == &space_info->priority_tickets) {
5543 head = &space_info->tickets;
5544 goto again;
5545 }
5546}
5547
5548static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5549 struct btrfs_block_rsv *block_rsv,
5550 struct btrfs_block_rsv *dest, u64 num_bytes,
5551 u64 *qgroup_to_release_ret)
5552{
5553 struct btrfs_space_info *space_info = block_rsv->space_info;
5554 u64 qgroup_to_release = 0;
5555 u64 ret;
5556
5557 spin_lock(&block_rsv->lock);
5558 if (num_bytes == (u64)-1) {
5559 num_bytes = block_rsv->size;
5560 qgroup_to_release = block_rsv->qgroup_rsv_size;
5561 }
5562 block_rsv->size -= num_bytes;
5563 if (block_rsv->reserved >= block_rsv->size) {
5564 num_bytes = block_rsv->reserved - block_rsv->size;
5565 block_rsv->reserved = block_rsv->size;
5566 block_rsv->full = 1;
5567 } else {
5568 num_bytes = 0;
5569 }
5570 if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
5571 qgroup_to_release = block_rsv->qgroup_rsv_reserved -
5572 block_rsv->qgroup_rsv_size;
5573 block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
5574 } else {
5575 qgroup_to_release = 0;
5576 }
5577 spin_unlock(&block_rsv->lock);
5578
5579 ret = num_bytes;
5580 if (num_bytes > 0) {
5581 if (dest) {
5582 spin_lock(&dest->lock);
5583 if (!dest->full) {
5584 u64 bytes_to_add;
5585
5586 bytes_to_add = dest->size - dest->reserved;
5587 bytes_to_add = min(num_bytes, bytes_to_add);
5588 dest->reserved += bytes_to_add;
5589 if (dest->reserved >= dest->size)
5590 dest->full = 1;
5591 num_bytes -= bytes_to_add;
5592 }
5593 spin_unlock(&dest->lock);
5594 }
5595 if (num_bytes)
5596 space_info_add_old_bytes(fs_info, space_info,
5597 num_bytes);
5598 }
5599 if (qgroup_to_release_ret)
5600 *qgroup_to_release_ret = qgroup_to_release;
5601 return ret;
5602}
5603
5604int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5605 struct btrfs_block_rsv *dst, u64 num_bytes,
5606 bool update_size)
5607{
5608 int ret;
5609
5610 ret = block_rsv_use_bytes(src, num_bytes);
5611 if (ret)
5612 return ret;
5613
5614 block_rsv_add_bytes(dst, num_bytes, update_size);
5615 return 0;
5616}
5617
5618void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5619{
5620 memset(rsv, 0, sizeof(*rsv));
5621 spin_lock_init(&rsv->lock);
5622 rsv->type = type;
5623}
5624
5625void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5626 struct btrfs_block_rsv *rsv,
5627 unsigned short type)
5628{
5629 btrfs_init_block_rsv(rsv, type);
5630 rsv->space_info = __find_space_info(fs_info,
5631 BTRFS_BLOCK_GROUP_METADATA);
5632}
5633
5634struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5635 unsigned short type)
5636{
5637 struct btrfs_block_rsv *block_rsv;
5638
5639 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5640 if (!block_rsv)
5641 return NULL;
5642
5643 btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5644 return block_rsv;
5645}
5646
5647void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
5648 struct btrfs_block_rsv *rsv)
5649{
5650 if (!rsv)
5651 return;
5652 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5653 kfree(rsv);
5654}
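Taken together, the block_rsv helpers above support a simple lifecycle for a short-lived reservation. A sketch only: the byte count is illustrative and BTRFS_BLOCK_RSV_TEMP is assumed to be the type used for such temporary reservations.

	struct btrfs_block_rsv *rsv;
	int ret;

	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv)
		return -ENOMEM;

	/* Reserve the metadata space up front, flushing if necessary. */
	ret = btrfs_block_rsv_add(root, rsv, SZ_1M, BTRFS_RESERVE_FLUSH_ALL);
	if (ret)
		goto out;

	/* ... do the metadata modifications charged against rsv ... */
out:
	/* Drops any remaining reservation and frees the rsv itself. */
	btrfs_free_block_rsv(fs_info, rsv);
	return ret;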
5655
5656int btrfs_block_rsv_add(struct btrfs_root *root,
5657 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5658 enum btrfs_reserve_flush_enum flush)
5659{
5660 int ret;
5661
5662 if (num_bytes == 0)
5663 return 0;
5664
5665 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5666 if (!ret)
5667 block_rsv_add_bytes(block_rsv, num_bytes, true);
5668
5669 return ret;
5670}
5671
5672int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
5673{
5674 u64 num_bytes = 0;
5675 int ret = -ENOSPC;
5676
5677 if (!block_rsv)
5678 return 0;
5679
5680 spin_lock(&block_rsv->lock);
5681 num_bytes = div_factor(block_rsv->size, min_factor);
5682 if (block_rsv->reserved >= num_bytes)
5683 ret = 0;
5684 spin_unlock(&block_rsv->lock);
5685
5686 return ret;
5687}
5688
5689int btrfs_block_rsv_refill(struct btrfs_root *root,
5690 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5691 enum btrfs_reserve_flush_enum flush)
5692{
5693 u64 num_bytes = 0;
5694 int ret = -ENOSPC;
5695
5696 if (!block_rsv)
5697 return 0;
5698
5699 spin_lock(&block_rsv->lock);
5700 num_bytes = min_reserved;
5701 if (block_rsv->reserved >= num_bytes)
5702 ret = 0;
5703 else
5704 num_bytes -= block_rsv->reserved;
5705 spin_unlock(&block_rsv->lock);
5706
5707 if (!ret)
5708 return 0;
5709
5710 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5711 if (!ret) {
5712 block_rsv_add_bytes(block_rsv, num_bytes, false);
5713 return 0;
5714 }
5715
5716 return ret;
5717}
5718
5719static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5720 struct btrfs_block_rsv *block_rsv,
5721 u64 num_bytes, u64 *qgroup_to_release)
5722{
5723 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5724 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
5725 struct btrfs_block_rsv *target = delayed_rsv;
5726
5727 if (target->full || target == block_rsv)
5728 target = global_rsv;
5729
5730 if (block_rsv->space_info != target->space_info)
5731 target = NULL;
5732
5733 return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
5734 qgroup_to_release);
5735}
5736
5737void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5738 struct btrfs_block_rsv *block_rsv,
5739 u64 num_bytes)
5740{
5741 __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
5742}
5743
5744/**
5745 * btrfs_inode_rsv_release - release any excessive reservation.
5746 * @inode - the inode we need to release from.
5747 * @qgroup_free - free or convert qgroup meta.
5748 * Unlike normal operation, qgroup meta reservation needs to know if we are
5749 * freeing qgroup reservation or just converting it into per-trans. Normally
5750 * @qgroup_free is true for error handling, and false for normal release.
5751 *
5752 * This is the same as btrfs_block_rsv_release, except that it handles the
5753 * tracepoint for the reservation.
5754 */
5755static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5756{
5757 struct btrfs_fs_info *fs_info = inode->root->fs_info;
5758 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5759 u64 released = 0;
5760 u64 qgroup_to_release = 0;
5761
5762 /*
5763	 * Since we statically set block_rsv->size, we just say that we are
5764	 * releasing 0 bytes, and then whatever reservation exceeds the size
5765	 * gets freed.
5766 */
5767 released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
5768 &qgroup_to_release);
5769 if (released > 0)
5770 trace_btrfs_space_reservation(fs_info, "delalloc",
5771 btrfs_ino(inode), released, 0);
5772 if (qgroup_free)
5773 btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
5774 else
5775 btrfs_qgroup_convert_reserved_meta(inode->root,
5776 qgroup_to_release);
5777}
5778
5779/**
5780 * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
5781 * @fs_info - the fs_info for our fs.
5782 * @nr - the number of items to drop.
5783 *
5784 * This drops the delayed ref head's count from the delayed refs rsv and frees
5785 * any excess reservation we had.
5786 */
5787void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
5788{
5789 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5790 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5791 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
5792 u64 released = 0;
5793
5794 released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
5795 num_bytes, NULL);
5796 if (released)
5797 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5798 0, released, 0);
5799}
5800
5801static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5802{
5803 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5804 struct btrfs_space_info *sinfo = block_rsv->space_info;
5805 u64 num_bytes;
5806
5807 /*
5808 * The global block rsv is based on the size of the extent tree, the
5809 * checksum tree and the root tree. If the fs is empty we want to set
5810 * it to a minimal amount for safety.
5811 */
5812 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5813 btrfs_root_used(&fs_info->csum_root->root_item) +
5814 btrfs_root_used(&fs_info->tree_root->root_item);
5815 num_bytes = max_t(u64, num_bytes, SZ_16M);
5816
5817 spin_lock(&sinfo->lock);
5818 spin_lock(&block_rsv->lock);
5819
5820 block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5821
5822 if (block_rsv->reserved < block_rsv->size) {
5823 num_bytes = btrfs_space_info_used(sinfo, true);
5824 if (sinfo->total_bytes > num_bytes) {
5825 num_bytes = sinfo->total_bytes - num_bytes;
5826 num_bytes = min(num_bytes,
5827 block_rsv->size - block_rsv->reserved);
5828 block_rsv->reserved += num_bytes;
5829 update_bytes_may_use(sinfo, num_bytes);
5830 trace_btrfs_space_reservation(fs_info, "space_info",
5831 sinfo->flags, num_bytes,
5832 1);
5833 }
5834 } else if (block_rsv->reserved > block_rsv->size) {
5835 num_bytes = block_rsv->reserved - block_rsv->size;
5836 update_bytes_may_use(sinfo, -num_bytes);
5837 trace_btrfs_space_reservation(fs_info, "space_info",
5838 sinfo->flags, num_bytes, 0);
5839 block_rsv->reserved = block_rsv->size;
5840 }
5841
5842 if (block_rsv->reserved == block_rsv->size)
5843 block_rsv->full = 1;
5844 else
5845 block_rsv->full = 0;
5846
5847 spin_unlock(&block_rsv->lock);
5848 spin_unlock(&sinfo->lock);
5849}
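A toy model of the sizing step in update_global_block_rsv() above: the combined usage of the three trees is given a 16M floor, and the resulting reserve size is capped at 512M. The per-tree numbers below are invented; this is not kernel code.

#include <stdio.h>

#define SZ_16M  (16ULL << 20)
#define SZ_512M (512ULL << 20)

int main(void)
{
        /* invented per-tree usage, standing in for btrfs_root_used() */
        unsigned long long extent_root = 700ULL << 20;
        unsigned long long csum_root = 90ULL << 20;
        unsigned long long tree_root = 1ULL << 20;
        unsigned long long num_bytes = extent_root + csum_root + tree_root;

        if (num_bytes < SZ_16M)         /* floor for a nearly empty fs */
                num_bytes = SZ_16M;
        if (num_bytes > SZ_512M)        /* cap applied when setting ->size */
                num_bytes = SZ_512M;

        printf("global rsv size = %llu MiB\n", num_bytes >> 20);
        return 0;
}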
5850
5851static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5852{
5853 struct btrfs_space_info *space_info;
5854
5855 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5856 fs_info->chunk_block_rsv.space_info = space_info;
5857
5858 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5859 fs_info->global_block_rsv.space_info = space_info;
5860 fs_info->trans_block_rsv.space_info = space_info;
5861 fs_info->empty_block_rsv.space_info = space_info;
5862 fs_info->delayed_block_rsv.space_info = space_info;
5863 fs_info->delayed_refs_rsv.space_info = space_info;
5864
5865 fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
5866 fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
5867 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5868 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5869 if (fs_info->quota_root)
5870 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5871 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5872
5873 update_global_block_rsv(fs_info);
5874}
5875
5876static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5877{
5878 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5879 (u64)-1, NULL);
5880 WARN_ON(fs_info->trans_block_rsv.size > 0);
5881 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5882 WARN_ON(fs_info->chunk_block_rsv.size > 0);
5883 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5884 WARN_ON(fs_info->delayed_block_rsv.size > 0);
5885 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5886 WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
5887 WARN_ON(fs_info->delayed_refs_rsv.size > 0);
5888}
5889
5890/*
5891 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
5892 * @trans - the trans that may have generated delayed refs
5893 *
5894 * This is to be called any time we may have adjusted trans->delayed_ref_updates;
5895 * it will calculate the additional size and add it to the delayed_refs_rsv.
5896 */
5897void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
5898{
5899 struct btrfs_fs_info *fs_info = trans->fs_info;
5900 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
5901 u64 num_bytes;
5902
5903 if (!trans->delayed_ref_updates)
5904 return;
5905
5906 num_bytes = btrfs_calc_trans_metadata_size(fs_info,
5907 trans->delayed_ref_updates);
5908 spin_lock(&delayed_rsv->lock);
5909 delayed_rsv->size += num_bytes;
5910 delayed_rsv->full = 0;
5911 spin_unlock(&delayed_rsv->lock);
5912 trans->delayed_ref_updates = 0;
5913}
5914
5915/*
5916 * To be called after all the new block groups attached to the transaction
5917 * handle have been created (btrfs_create_pending_block_groups()).
5918 */
5919void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5920{
5921 struct btrfs_fs_info *fs_info = trans->fs_info;
5922
5923 if (!trans->chunk_bytes_reserved)
5924 return;
5925
5926 WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5927
5928 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5929 trans->chunk_bytes_reserved, NULL);
5930 trans->chunk_bytes_reserved = 0;
5931}
5932
5933/*
5934 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5935 * root: the root of the parent directory
5936 * rsv: block reservation
5937 * items: the number of items that we need to reserve space for
5938 * use_global_rsv: allow fallback to the global block reservation
5939 *
5940 * This function is used to reserve the space for snapshot/subvolume
5941 * creation and deletion. These operations differ from common
5942 * file/directory operations: they change two fs/file trees and the
5943 * root tree, and the number of items the qgroup reserves differs
5944 * from the free space reservation, so we cannot use the space
5945 * reservation mechanism in start_transaction().
5946 */
5947int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5948 struct btrfs_block_rsv *rsv, int items,
5949 bool use_global_rsv)
5950{
5951 u64 qgroup_num_bytes = 0;
5952 u64 num_bytes;
5953 int ret;
5954 struct btrfs_fs_info *fs_info = root->fs_info;
5955 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5956
5957 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
5958 /* One for parent inode, two for dir entries */
5959 qgroup_num_bytes = 3 * fs_info->nodesize;
5960 ret = btrfs_qgroup_reserve_meta_prealloc(root,
5961 qgroup_num_bytes, true);
5962 if (ret)
5963 return ret;
5964 }
5965
5966 num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
5967 rsv->space_info = __find_space_info(fs_info,
5968 BTRFS_BLOCK_GROUP_METADATA);
5969 ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5970 BTRFS_RESERVE_FLUSH_ALL);
5971
5972 if (ret == -ENOSPC && use_global_rsv)
5973 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
5974
5975 if (ret && qgroup_num_bytes)
5976 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5977
5978 return ret;
5979}
5980
5981void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
5982 struct btrfs_block_rsv *rsv)
5983{
5984 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5985}
5986
5987static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
5988 struct btrfs_inode *inode)
5989{
5990 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5991 u64 reserve_size = 0;
5992 u64 qgroup_rsv_size = 0;
5993 u64 csum_leaves;
5994 unsigned outstanding_extents;
5995
5996 lockdep_assert_held(&inode->lock);
5997 outstanding_extents = inode->outstanding_extents;
5998 if (outstanding_extents)
5999 reserve_size = btrfs_calc_trans_metadata_size(fs_info,
6000 outstanding_extents + 1);
6001 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
6002 inode->csum_bytes);
6003 reserve_size += btrfs_calc_trans_metadata_size(fs_info,
6004 csum_leaves);
6005 /*
6006 * For qgroup rsv, the calculation is very simple:
6007 * account one nodesize for each outstanding extent
6008 *
6009	 * This is an overestimate in most cases.
6010 */
6011 qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
6012
6013 spin_lock(&block_rsv->lock);
6014 block_rsv->size = reserve_size;
6015 block_rsv->qgroup_rsv_size = qgroup_rsv_size;
6016 spin_unlock(&block_rsv->lock);
6017}
6018
6019static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
6020 u64 num_bytes, u64 *meta_reserve,
6021 u64 *qgroup_reserve)
6022{
6023 u64 nr_extents = count_max_extents(num_bytes);
6024 u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
6025
6026 /* We add one for the inode update at finish ordered time */
6027 *meta_reserve = btrfs_calc_trans_metadata_size(fs_info,
6028 nr_extents + csum_leaves + 1);
6029 *qgroup_reserve = nr_extents * fs_info->nodesize;
6030}
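A worked sketch of the calc_inode_reservations() math above, under stated assumptions: a 128 MiB maximum extent size, a 16 KiB nodesize, an item cost of nodesize * 2 * BTRFS_MAX_LEVEL (an assumption about how btrfs_calc_trans_metadata_size() prices items, which may differ between versions), and checksums that happen to fit in a single leaf.

#include <stdio.h>

#define MAX_EXTENT_SIZE (128ULL << 20)  /* assumed BTRFS_MAX_EXTENT_SIZE */
#define NODESIZE        (16ULL << 10)   /* assumed nodesize              */
#define MAX_LEVEL       8

/* Assumed cost of one metadata item (see the note above). */
static unsigned long long item_cost(unsigned long long items)
{
        return items * NODESIZE * 2 * MAX_LEVEL;
}

int main(void)
{
        unsigned long long num_bytes = 4ULL << 20;      /* a 4 MiB write */
        unsigned long long nr_extents =
                (num_bytes + MAX_EXTENT_SIZE - 1) / MAX_EXTENT_SIZE;
        unsigned long long csum_leaves = 1;     /* assume one leaf is enough */

        /* one extra item for the inode update at ordered-extent finish */
        unsigned long long meta_reserve = item_cost(nr_extents + csum_leaves + 1);
        unsigned long long qgroup_reserve = nr_extents * NODESIZE;

        printf("meta_reserve=%llu qgroup_reserve=%llu\n",
               meta_reserve, qgroup_reserve);
        return 0;
}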
6031
6032int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
6033{
6034 struct btrfs_root *root = inode->root;
6035 struct btrfs_fs_info *fs_info = root->fs_info;
6036 struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
6037 u64 meta_reserve, qgroup_reserve;
6038 unsigned nr_extents;
6039 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
6040 int ret = 0;
6041 bool delalloc_lock = true;
6042
6043 /* If we are a free space inode we need to not flush since we will be in
6044 * the middle of a transaction commit. We also don't need the delalloc
6045 * mutex since we won't race with anybody. We need this mostly to make
6046 * lockdep shut its filthy mouth.
6047 *
6048 * If we have a transaction open (can happen if we call truncate_block
6049 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
6050 */
6051 if (btrfs_is_free_space_inode(inode)) {
6052 flush = BTRFS_RESERVE_NO_FLUSH;
6053 delalloc_lock = false;
6054 } else {
6055 if (current->journal_info)
6056 flush = BTRFS_RESERVE_FLUSH_LIMIT;
6057
6058 if (btrfs_transaction_in_commit(fs_info))
6059 schedule_timeout(1);
6060 }
6061
6062 if (delalloc_lock)
6063 mutex_lock(&inode->delalloc_mutex);
6064
6065 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6066
6067 /*
6068 * We always want to do it this way, every other way is wrong and ends
6069 * in tears. Pre-reserving the amount we are going to add will always
6070 * be the right way, because otherwise if we have enough parallelism we
6071 * could end up with thousands of inodes all holding little bits of
6072 * reservations they were able to make previously and the only way to
6073 * reclaim that space is to ENOSPC out the operations and clear
6074 * everything out and try again, which is bad. This way we just
6075 * over-reserve slightly, and clean up the mess when we are done.
6076 */
6077 calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
6078 &qgroup_reserve);
6079 ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
6080 if (ret)
6081 goto out_fail;
6082 ret = reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
6083 if (ret)
6084 goto out_qgroup;
6085
6086 /*
6087 * Now we need to update our outstanding extents and csum bytes _first_
6088 * and then add the reservation to the block_rsv. This keeps us from
6089 * racing with an ordered completion or some such that would think it
6090 * needs to free the reservation we just made.
6091 */
6092 spin_lock(&inode->lock);
6093 nr_extents = count_max_extents(num_bytes);
6094 btrfs_mod_outstanding_extents(inode, nr_extents);
6095 inode->csum_bytes += num_bytes;
6096 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6097 spin_unlock(&inode->lock);
6098
6099 /* Now we can safely add our space to our block rsv */
6100 block_rsv_add_bytes(block_rsv, meta_reserve, false);
6101 trace_btrfs_space_reservation(root->fs_info, "delalloc",
6102 btrfs_ino(inode), meta_reserve, 1);
6103
6104 spin_lock(&block_rsv->lock);
6105 block_rsv->qgroup_rsv_reserved += qgroup_reserve;
6106 spin_unlock(&block_rsv->lock);
6107
6108 if (delalloc_lock)
6109 mutex_unlock(&inode->delalloc_mutex);
6110 return 0;
6111out_qgroup:
6112 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
6113out_fail:
6114 btrfs_inode_rsv_release(inode, true);
6115 if (delalloc_lock)
6116 mutex_unlock(&inode->delalloc_mutex);
6117 return ret;
6118}
6119
6120/**
6121 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6122 * @inode: the inode to release the reservation for.
6123 * @num_bytes: the number of bytes we are releasing.
6124 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
6125 *
6126 * This will release the metadata reservation for an inode. This can be called
6127 * once we complete IO for a given set of bytes to release their metadata
6128 * reservations, or on error for the same reason.
6129 */
6130void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
6131 bool qgroup_free)
6132{
6133 struct btrfs_fs_info *fs_info = inode->root->fs_info;
6134
6135 num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6136 spin_lock(&inode->lock);
6137 inode->csum_bytes -= num_bytes;
6138 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6139 spin_unlock(&inode->lock);
6140
6141 if (btrfs_is_testing(fs_info))
6142 return;
6143
6144 btrfs_inode_rsv_release(inode, qgroup_free);
6145}
6146
6147/**
6148 * btrfs_delalloc_release_extents - release our outstanding_extents
6149 * @inode: the inode to balance the reservation for.
6150 * @num_bytes: the number of bytes we originally reserved
6151 * @qgroup_free: do we need to free the qgroup meta reservation or convert it.
6152 *
6153 * When we reserve space we increase outstanding_extents for the extents we may
6154 * add. Once we've set the range as delalloc or created our ordered extents we
6155 * have outstanding_extents to track the real usage, so we use this to free our
6156 * temporarily tracked outstanding_extents. This _must_ be used in conjunction
6157 * with btrfs_delalloc_reserve_metadata.
6158 */
6159void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
6160 bool qgroup_free)
6161{
6162 struct btrfs_fs_info *fs_info = inode->root->fs_info;
6163 unsigned num_extents;
6164
6165 spin_lock(&inode->lock);
6166 num_extents = count_max_extents(num_bytes);
6167 btrfs_mod_outstanding_extents(inode, -num_extents);
6168 btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6169 spin_unlock(&inode->lock);
6170
6171 if (btrfs_is_testing(fs_info))
6172 return;
6173
6174 btrfs_inode_rsv_release(inode, qgroup_free);
6175}
6176
6177/**
6178 * btrfs_delalloc_reserve_space - reserve data and metadata space for
6179 * delalloc
6180 * @inode: inode we're writing to
6181 * @start: start of the range we are writing to
6182 * @len: the length of the range we are writing to
6183 * @reserved: mandatory parameter, records the qgroup ranges actually
6184 * reserved by the current reservation.
6185 *
6186 * This will do the following things
6187 *
6188 * o reserve space in the data space info for num bytes
6189 * and reserve the corresponding qgroup space
6190 * (Done in check_data_free_space)
6191 *
6192 * o reserve space for metadata space, based on the number of outstanding
6193 * extents and how many csums will be needed
6194 * also reserve metadata space in a per-root over-reserve method.
6195 * o add to the inode's delalloc_bytes
6196 * o add it to the fs_info's delalloc inodes list.
6197 * (The above three are all done in delalloc_reserve_metadata)
6198 *
6199 * Return 0 for success
6200 * Return <0 for error (-ENOSPC or -EDQUOT)
6201 */
6202int btrfs_delalloc_reserve_space(struct inode *inode,
6203 struct extent_changeset **reserved, u64 start, u64 len)
6204{
6205 int ret;
6206
6207 ret = btrfs_check_data_free_space(inode, reserved, start, len);
6208 if (ret < 0)
6209 return ret;
6210 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
6211 if (ret < 0)
6212 btrfs_free_reserved_data_space(inode, *reserved, start, len);
6213 return ret;
6214}
6215
6216/**
6217 * btrfs_delalloc_release_space - release data and metadata space for delalloc
6218 * @inode: inode we're releasing space for
6219 * @start: start position of the space already reserved
6220 * @len: the length of the space already reserved
6221 * @qgroup_free: free the qgroup reservation or convert it to a per-trans one
6222 *
6223 * This function will release the metadata space that was not used and will
6224 * decrement ->delalloc_bytes and remove the inode from the fs_info
6225 * delalloc_inodes list if there are no delalloc bytes left.
6226 * It also handles the qgroup reserved space.
6227 */
6228void btrfs_delalloc_release_space(struct inode *inode,
6229 struct extent_changeset *reserved,
6230 u64 start, u64 len, bool qgroup_free)
6231{
6232 btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
6233 btrfs_free_reserved_data_space(inode, reserved, start, len);
6234}
6235
6236static int update_block_group(struct btrfs_trans_handle *trans, 4108static int update_block_group(struct btrfs_trans_handle *trans,
6237 u64 bytenr, u64 num_bytes, int alloc) 4109 u64 bytenr, u64 num_bytes, int alloc)
6238{ 4110{
@@ -6296,7 +4168,8 @@ static int update_block_group(struct btrfs_trans_handle *trans,
6296 old_val -= num_bytes; 4168 old_val -= num_bytes;
6297 btrfs_set_block_group_used(&cache->item, old_val); 4169 btrfs_set_block_group_used(&cache->item, old_val);
6298 cache->pinned += num_bytes; 4170 cache->pinned += num_bytes;
6299 update_bytes_pinned(cache->space_info, num_bytes); 4171 btrfs_space_info_update_bytes_pinned(info,
4172 cache->space_info, num_bytes);
6300 cache->space_info->bytes_used -= num_bytes; 4173 cache->space_info->bytes_used -= num_bytes;
6301 cache->space_info->disk_used -= num_bytes * factor; 4174 cache->space_info->disk_used -= num_bytes * factor;
6302 spin_unlock(&cache->lock); 4175 spin_unlock(&cache->lock);
@@ -6371,7 +4244,8 @@ static int pin_down_extent(struct btrfs_block_group_cache *cache,
6371 spin_lock(&cache->space_info->lock); 4244 spin_lock(&cache->space_info->lock);
6372 spin_lock(&cache->lock); 4245 spin_lock(&cache->lock);
6373 cache->pinned += num_bytes; 4246 cache->pinned += num_bytes;
6374 update_bytes_pinned(cache->space_info, num_bytes); 4247 btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
4248 num_bytes);
6375 if (reserved) { 4249 if (reserved) {
6376 cache->reserved -= num_bytes; 4250 cache->reserved -= num_bytes;
6377 cache->space_info->bytes_reserved -= num_bytes; 4251 cache->space_info->bytes_reserved -= num_bytes;
@@ -6580,7 +4454,8 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6580 } else { 4454 } else {
6581 cache->reserved += num_bytes; 4455 cache->reserved += num_bytes;
6582 space_info->bytes_reserved += num_bytes; 4456 space_info->bytes_reserved += num_bytes;
6583 update_bytes_may_use(space_info, -ram_bytes); 4457 btrfs_space_info_update_bytes_may_use(cache->fs_info,
4458 space_info, -ram_bytes);
6584 if (delalloc) 4459 if (delalloc)
6585 cache->delalloc_bytes += num_bytes; 4460 cache->delalloc_bytes += num_bytes;
6586 } 4461 }
@@ -6646,7 +4521,7 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
6646 4521
6647 up_write(&fs_info->commit_root_sem); 4522 up_write(&fs_info->commit_root_sem);
6648 4523
6649 update_global_block_rsv(fs_info); 4524 btrfs_update_global_block_rsv(fs_info);
6650} 4525}
6651 4526
6652/* 4527/*
@@ -6736,7 +4611,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
6736 spin_lock(&space_info->lock); 4611 spin_lock(&space_info->lock);
6737 spin_lock(&cache->lock); 4612 spin_lock(&cache->lock);
6738 cache->pinned -= len; 4613 cache->pinned -= len;
6739 update_bytes_pinned(space_info, -len); 4614 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
6740 4615
6741 trace_btrfs_space_reservation(fs_info, "pinned", 4616 trace_btrfs_space_reservation(fs_info, "pinned",
6742 space_info->flags, len, 0); 4617 space_info->flags, len, 0);
@@ -6757,7 +4632,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
6757 to_add = min(len, global_rsv->size - 4632 to_add = min(len, global_rsv->size -
6758 global_rsv->reserved); 4633 global_rsv->reserved);
6759 global_rsv->reserved += to_add; 4634 global_rsv->reserved += to_add;
6760 update_bytes_may_use(space_info, to_add); 4635 btrfs_space_info_update_bytes_may_use(fs_info,
4636 space_info, to_add);
6761 if (global_rsv->reserved >= global_rsv->size) 4637 if (global_rsv->reserved >= global_rsv->size)
6762 global_rsv->full = 1; 4638 global_rsv->full = 1;
6763 trace_btrfs_space_reservation(fs_info, 4639 trace_btrfs_space_reservation(fs_info,
@@ -6769,8 +4645,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
6769 spin_unlock(&global_rsv->lock); 4645 spin_unlock(&global_rsv->lock);
6770 /* Add to any tickets we may have */ 4646 /* Add to any tickets we may have */
6771 if (len) 4647 if (len)
6772 space_info_add_new_bytes(fs_info, space_info, 4648 btrfs_space_info_add_new_bytes(fs_info,
6773 len); 4649 space_info, len);
6774 } 4650 }
6775 spin_unlock(&space_info->lock); 4651 spin_unlock(&space_info->lock);
6776 } 4652 }
@@ -7191,7 +5067,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7191 } 5067 }
7192out: 5068out:
7193 if (pin) 5069 if (pin)
7194 add_pinned_bytes(fs_info, &generic_ref, 1); 5070 add_pinned_bytes(fs_info, &generic_ref);
7195 5071
7196 if (last_ref) { 5072 if (last_ref) {
7197 /* 5073 /*
@@ -7239,7 +5115,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
7239 btrfs_ref_tree_mod(fs_info, ref); 5115 btrfs_ref_tree_mod(fs_info, ref);
7240 5116
7241 if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) 5117 if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
7242 add_pinned_bytes(fs_info, ref, 1); 5118 add_pinned_bytes(fs_info, ref);
7243 5119
7244 return ret; 5120 return ret;
7245} 5121}
@@ -7292,10 +5168,10 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7292} 5168}
7293 5169
7294enum btrfs_loop_type { 5170enum btrfs_loop_type {
7295 LOOP_CACHING_NOWAIT = 0, 5171 LOOP_CACHING_NOWAIT,
7296 LOOP_CACHING_WAIT = 1, 5172 LOOP_CACHING_WAIT,
7297 LOOP_ALLOC_CHUNK = 2, 5173 LOOP_ALLOC_CHUNK,
7298 LOOP_NO_EMPTY_SIZE = 3, 5174 LOOP_NO_EMPTY_SIZE,
7299}; 5175};
7300 5176
7301static inline void 5177static inline void
@@ -7661,8 +5537,8 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
7661 return ret; 5537 return ret;
7662 } 5538 }
7663 5539
7664 ret = do_chunk_alloc(trans, ffe_ctl->flags, 5540 ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
7665 CHUNK_ALLOC_FORCE); 5541 CHUNK_ALLOC_FORCE);
7666 5542
7667 /* 5543 /*
7668 * If we can't allocate a new chunk we've already looped 5544 * If we can't allocate a new chunk we've already looped
@@ -7758,7 +5634,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
7758 5634
7759 trace_find_free_extent(fs_info, num_bytes, empty_size, flags); 5635 trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
7760 5636
7761 space_info = __find_space_info(fs_info, flags); 5637 space_info = btrfs_find_space_info(fs_info, flags);
7762 if (!space_info) { 5638 if (!space_info) {
7763 btrfs_err(fs_info, "No space info for %llu", flags); 5639 btrfs_err(fs_info, "No space info for %llu", flags);
7764 return -ENOSPC; 5640 return -ENOSPC;
@@ -7863,9 +5739,8 @@ search:
7863 */ 5739 */
7864 if (!block_group_bits(block_group, flags)) { 5740 if (!block_group_bits(block_group, flags)) {
7865 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5741 u64 extra = BTRFS_BLOCK_GROUP_DUP |
7866 BTRFS_BLOCK_GROUP_RAID1 | 5742 BTRFS_BLOCK_GROUP_RAID1_MASK |
7867 BTRFS_BLOCK_GROUP_RAID5 | 5743 BTRFS_BLOCK_GROUP_RAID56_MASK |
7868 BTRFS_BLOCK_GROUP_RAID6 |
7869 BTRFS_BLOCK_GROUP_RAID10; 5744 BTRFS_BLOCK_GROUP_RAID10;
7870 5745
7871 /* 5746 /*
@@ -7984,60 +5859,6 @@ loop:
7984 return ret; 5859 return ret;
7985} 5860}
7986 5861
7987#define DUMP_BLOCK_RSV(fs_info, rsv_name) \
7988do { \
7989 struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
7990 spin_lock(&__rsv->lock); \
7991 btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
7992 __rsv->size, __rsv->reserved); \
7993 spin_unlock(&__rsv->lock); \
7994} while (0)
7995
7996static void dump_space_info(struct btrfs_fs_info *fs_info,
7997 struct btrfs_space_info *info, u64 bytes,
7998 int dump_block_groups)
7999{
8000 struct btrfs_block_group_cache *cache;
8001 int index = 0;
8002
8003 spin_lock(&info->lock);
8004 btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
8005 info->flags,
8006 info->total_bytes - btrfs_space_info_used(info, true),
8007 info->full ? "" : "not ");
8008 btrfs_info(fs_info,
8009 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
8010 info->total_bytes, info->bytes_used, info->bytes_pinned,
8011 info->bytes_reserved, info->bytes_may_use,
8012 info->bytes_readonly);
8013 spin_unlock(&info->lock);
8014
8015 DUMP_BLOCK_RSV(fs_info, global_block_rsv);
8016 DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
8017 DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
8018 DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
8019 DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
8020
8021 if (!dump_block_groups)
8022 return;
8023
8024 down_read(&info->groups_sem);
8025again:
8026 list_for_each_entry(cache, &info->block_groups[index], list) {
8027 spin_lock(&cache->lock);
8028 btrfs_info(fs_info,
8029 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
8030 cache->key.objectid, cache->key.offset,
8031 btrfs_block_group_used(&cache->item), cache->pinned,
8032 cache->reserved, cache->ro ? "[readonly]" : "");
8033 btrfs_dump_free_space(cache, bytes);
8034 spin_unlock(&cache->lock);
8035 }
8036 if (++index < BTRFS_NR_RAID_TYPES)
8037 goto again;
8038 up_read(&info->groups_sem);
8039}
8040
8041/* 5862/*
8042 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a 5863 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
8043 * hole that is at least as big as @num_bytes. 5864 * hole that is at least as big as @num_bytes.
@@ -8113,12 +5934,13 @@ again:
8113 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 5934 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8114 struct btrfs_space_info *sinfo; 5935 struct btrfs_space_info *sinfo;
8115 5936
8116 sinfo = __find_space_info(fs_info, flags); 5937 sinfo = btrfs_find_space_info(fs_info, flags);
8117 btrfs_err(fs_info, 5938 btrfs_err(fs_info,
8118 "allocation failed flags %llu, wanted %llu", 5939 "allocation failed flags %llu, wanted %llu",
8119 flags, num_bytes); 5940 flags, num_bytes);
8120 if (sinfo) 5941 if (sinfo)
8121 dump_space_info(fs_info, sinfo, num_bytes, 1); 5942 btrfs_dump_space_info(fs_info, sinfo,
5943 num_bytes, 1);
8122 } 5944 }
8123 } 5945 }
8124 5946
@@ -8456,73 +6278,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8456 return buf; 6278 return buf;
8457} 6279}
8458 6280
8459static struct btrfs_block_rsv *
8460use_block_rsv(struct btrfs_trans_handle *trans,
8461 struct btrfs_root *root, u32 blocksize)
8462{
8463 struct btrfs_fs_info *fs_info = root->fs_info;
8464 struct btrfs_block_rsv *block_rsv;
8465 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8466 int ret;
8467 bool global_updated = false;
8468
8469 block_rsv = get_block_rsv(trans, root);
8470
8471 if (unlikely(block_rsv->size == 0))
8472 goto try_reserve;
8473again:
8474 ret = block_rsv_use_bytes(block_rsv, blocksize);
8475 if (!ret)
8476 return block_rsv;
8477
8478 if (block_rsv->failfast)
8479 return ERR_PTR(ret);
8480
8481 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8482 global_updated = true;
8483 update_global_block_rsv(fs_info);
8484 goto again;
8485 }
8486
8487 /*
8488 * The global reserve still exists to save us from ourselves, so don't
8489 * warn_on if we are short on our delayed refs reserve.
8490 */
8491 if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
8492 btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8493 static DEFINE_RATELIMIT_STATE(_rs,
8494 DEFAULT_RATELIMIT_INTERVAL * 10,
8495 /*DEFAULT_RATELIMIT_BURST*/ 1);
8496 if (__ratelimit(&_rs))
8497 WARN(1, KERN_DEBUG
8498 "BTRFS: block rsv returned %d\n", ret);
8499 }
8500try_reserve:
8501 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8502 BTRFS_RESERVE_NO_FLUSH);
8503 if (!ret)
8504 return block_rsv;
8505 /*
8506 * If we couldn't reserve metadata bytes try and use some from
8507 * the global reserve if its space type is the same as the global
8508 * reservation.
8509 */
8510 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8511 block_rsv->space_info == global_rsv->space_info) {
8512 ret = block_rsv_use_bytes(global_rsv, blocksize);
8513 if (!ret)
8514 return global_rsv;
8515 }
8516 return ERR_PTR(ret);
8517}
8518
8519static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8520 struct btrfs_block_rsv *block_rsv, u32 blocksize)
8521{
8522 block_rsv_add_bytes(block_rsv, blocksize, false);
8523 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
8524}
8525
8526/* 6281/*
8527 * finds a free extent and does all the dirty work required for allocation 6282 * finds a free extent and does all the dirty work required for allocation
8528 * returns the tree buffer or an ERR_PTR on error. 6283 * returns the tree buffer or an ERR_PTR on error.
@@ -8555,7 +6310,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8555 } 6310 }
8556#endif 6311#endif
8557 6312
8558 block_rsv = use_block_rsv(trans, root, blocksize); 6313 block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
8559 if (IS_ERR(block_rsv)) 6314 if (IS_ERR(block_rsv))
8560 return ERR_CAST(block_rsv); 6315 return ERR_CAST(block_rsv);
8561 6316
@@ -8613,7 +6368,7 @@ out_free_buf:
8613out_free_reserved: 6368out_free_reserved:
8614 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); 6369 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
8615out_unuse: 6370out_unuse:
8616 unuse_block_rsv(fs_info, block_rsv, blocksize); 6371 btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
8617 return ERR_PTR(ret); 6372 return ERR_PTR(ret);
8618} 6373}
8619 6374
@@ -9552,9 +7307,8 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9552 7307
9553 num_devices = fs_info->fs_devices->rw_devices; 7308 num_devices = fs_info->fs_devices->rw_devices;
9554 7309
9555 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7310 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
9556 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 7311 BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
9557 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9558 7312
9559 if (num_devices == 1) { 7313 if (num_devices == 1) {
9560 stripped |= BTRFS_BLOCK_GROUP_DUP; 7314 stripped |= BTRFS_BLOCK_GROUP_DUP;
@@ -9565,7 +7319,7 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9565 return stripped; 7319 return stripped;
9566 7320
9567 /* turn mirroring into duplication */ 7321 /* turn mirroring into duplication */
9568 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 7322 if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
9569 BTRFS_BLOCK_GROUP_RAID10)) 7323 BTRFS_BLOCK_GROUP_RAID10))
9570 return stripped | BTRFS_BLOCK_GROUP_DUP; 7324 return stripped | BTRFS_BLOCK_GROUP_DUP;
9571 } else { 7325 } else {
@@ -9636,7 +7390,7 @@ out:
9636 btrfs_info(cache->fs_info, 7390 btrfs_info(cache->fs_info,
9637 "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", 7391 "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
9638 sinfo_used, num_bytes, min_allocable_bytes); 7392 sinfo_used, num_bytes, min_allocable_bytes);
9639 dump_space_info(cache->fs_info, cache->space_info, 0, 0); 7393 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
9640 } 7394 }
9641 return ret; 7395 return ret;
9642} 7396}
@@ -9678,8 +7432,7 @@ again:
9678 */ 7432 */
9679 alloc_flags = update_block_group_flags(fs_info, cache->flags); 7433 alloc_flags = update_block_group_flags(fs_info, cache->flags);
9680 if (alloc_flags != cache->flags) { 7434 if (alloc_flags != cache->flags) {
9681 ret = do_chunk_alloc(trans, alloc_flags, 7435 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9682 CHUNK_ALLOC_FORCE);
9683 /* 7436 /*
9684 * ENOSPC is allowed here, we may have enough space 7437 * ENOSPC is allowed here, we may have enough space
9685 * already allocated at the new raid level to 7438 * already allocated at the new raid level to
@@ -9695,7 +7448,7 @@ again:
9695 if (!ret) 7448 if (!ret)
9696 goto out; 7449 goto out;
9697 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); 7450 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9698 ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 7451 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9699 if (ret < 0) 7452 if (ret < 0)
9700 goto out; 7453 goto out;
9701 ret = inc_block_group_ro(cache, 0); 7454 ret = inc_block_group_ro(cache, 0);
@@ -9716,7 +7469,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
9716{ 7469{
9717 u64 alloc_flags = get_alloc_profile(trans->fs_info, type); 7470 u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
9718 7471
9719 return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); 7472 return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9720} 7473}
9721 7474
9722/* 7475/*
@@ -9949,7 +7702,7 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
9949 struct extent_map_tree *em_tree; 7702 struct extent_map_tree *em_tree;
9950 struct extent_map *em; 7703 struct extent_map *em;
9951 7704
9952 em_tree = &root->fs_info->mapping_tree.map_tree; 7705 em_tree = &root->fs_info->mapping_tree;
9953 read_lock(&em_tree->lock); 7706 read_lock(&em_tree->lock);
9954 em = lookup_extent_mapping(em_tree, found_key.objectid, 7707 em = lookup_extent_mapping(em_tree, found_key.objectid,
9955 found_key.offset); 7708 found_key.offset);
@@ -10102,7 +7855,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
10102 */ 7855 */
10103 synchronize_rcu(); 7856 synchronize_rcu();
10104 7857
10105 release_global_block_rsv(info); 7858 btrfs_release_global_block_rsv(info);
10106 7859
10107 while (!list_empty(&info->space_info)) { 7860 while (!list_empty(&info->space_info)) {
10108 int i; 7861 int i;
@@ -10118,7 +7871,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
10118 if (WARN_ON(space_info->bytes_pinned > 0 || 7871 if (WARN_ON(space_info->bytes_pinned > 0 ||
10119 space_info->bytes_reserved > 0 || 7872 space_info->bytes_reserved > 0 ||
10120 space_info->bytes_may_use > 0)) 7873 space_info->bytes_may_use > 0))
10121 dump_space_info(info, space_info, 0, 0); 7874 btrfs_dump_space_info(info, space_info, 0, 0);
10122 list_del(&space_info->list); 7875 list_del(&space_info->list);
10123 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 7876 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
10124 struct kobject *kobj; 7877 struct kobject *kobj;
@@ -10141,7 +7894,6 @@ void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
10141 struct btrfs_space_info *space_info; 7894 struct btrfs_space_info *space_info;
10142 struct raid_kobject *rkobj; 7895 struct raid_kobject *rkobj;
10143 LIST_HEAD(list); 7896 LIST_HEAD(list);
10144 int index;
10145 int ret = 0; 7897 int ret = 0;
10146 7898
10147 spin_lock(&fs_info->pending_raid_kobjs_lock); 7899 spin_lock(&fs_info->pending_raid_kobjs_lock);
@@ -10149,11 +7901,10 @@ void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
10149 spin_unlock(&fs_info->pending_raid_kobjs_lock); 7901 spin_unlock(&fs_info->pending_raid_kobjs_lock);
10150 7902
10151 list_for_each_entry(rkobj, &list, list) { 7903 list_for_each_entry(rkobj, &list, list) {
10152 space_info = __find_space_info(fs_info, rkobj->flags); 7904 space_info = btrfs_find_space_info(fs_info, rkobj->flags);
10153 index = btrfs_bg_flags_to_raid_index(rkobj->flags);
10154 7905
10155 ret = kobject_add(&rkobj->kobj, &space_info->kobj, 7906 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
10156 "%s", get_raid_name(index)); 7907 "%s", btrfs_bg_type_to_raid_name(rkobj->flags));
10157 if (ret) { 7908 if (ret) {
10158 kobject_put(&rkobj->kobj); 7909 kobject_put(&rkobj->kobj);
10159 break; 7910 break;
@@ -10243,21 +7994,21 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
10243 */ 7994 */
10244static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) 7995static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
10245{ 7996{
10246 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 7997 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
10247 struct extent_map *em; 7998 struct extent_map *em;
10248 struct btrfs_block_group_cache *bg; 7999 struct btrfs_block_group_cache *bg;
10249 u64 start = 0; 8000 u64 start = 0;
10250 int ret = 0; 8001 int ret = 0;
10251 8002
10252 while (1) { 8003 while (1) {
10253 read_lock(&map_tree->map_tree.lock); 8004 read_lock(&map_tree->lock);
10254 /* 8005 /*
10255 * lookup_extent_mapping will return the first extent map 8006 * lookup_extent_mapping will return the first extent map
10256 * intersecting the range, so setting @len to 1 is enough to 8007 * intersecting the range, so setting @len to 1 is enough to
10257 * get the first chunk. 8008 * get the first chunk.
10258 */ 8009 */
10259 em = lookup_extent_mapping(&map_tree->map_tree, start, 1); 8010 em = lookup_extent_mapping(map_tree, start, 1);
10260 read_unlock(&map_tree->map_tree.lock); 8011 read_unlock(&map_tree->lock);
10261 if (!em) 8012 if (!em)
10262 break; 8013 break;
10263 8014
@@ -10417,9 +8168,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
10417 } 8168 }
10418 8169
10419 trace_btrfs_add_block_group(info, cache, 0); 8170 trace_btrfs_add_block_group(info, cache, 0);
10420 update_space_info(info, cache->flags, found_key.offset, 8171 btrfs_update_space_info(info, cache->flags, found_key.offset,
10421 btrfs_block_group_used(&cache->item), 8172 btrfs_block_group_used(&cache->item),
10422 cache->bytes_super, &space_info); 8173 cache->bytes_super, &space_info);
10423 8174
10424 cache->space_info = space_info; 8175 cache->space_info = space_info;
10425 8176
@@ -10437,9 +8188,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
10437 list_for_each_entry_rcu(space_info, &info->space_info, list) { 8188 list_for_each_entry_rcu(space_info, &info->space_info, list) {
10438 if (!(get_alloc_profile(info, space_info->flags) & 8189 if (!(get_alloc_profile(info, space_info->flags) &
10439 (BTRFS_BLOCK_GROUP_RAID10 | 8190 (BTRFS_BLOCK_GROUP_RAID10 |
10440 BTRFS_BLOCK_GROUP_RAID1 | 8191 BTRFS_BLOCK_GROUP_RAID1_MASK |
10441 BTRFS_BLOCK_GROUP_RAID5 | 8192 BTRFS_BLOCK_GROUP_RAID56_MASK |
10442 BTRFS_BLOCK_GROUP_RAID6 |
10443 BTRFS_BLOCK_GROUP_DUP))) 8193 BTRFS_BLOCK_GROUP_DUP)))
10444 continue; 8194 continue;
10445 /* 8195 /*
@@ -10457,7 +8207,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
10457 } 8207 }
10458 8208
10459 btrfs_add_raid_kobjects(info); 8209 btrfs_add_raid_kobjects(info);
10460 init_global_block_rsv(info); 8210 btrfs_init_global_block_rsv(info);
10461 ret = check_chunk_block_group_mappings(info); 8211 ret = check_chunk_block_group_mappings(info);
10462error: 8212error:
10463 btrfs_free_path(path); 8213 btrfs_free_path(path);
@@ -10554,7 +8304,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
10554 * assigned to our block group. We want our bg to be added to the rbtree 8304 * assigned to our block group. We want our bg to be added to the rbtree
10555 * with its ->space_info set. 8305 * with its ->space_info set.
10556 */ 8306 */
10557 cache->space_info = __find_space_info(fs_info, cache->flags); 8307 cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
10558 ASSERT(cache->space_info); 8308 ASSERT(cache->space_info);
10559 8309
10560 ret = btrfs_add_block_group_cache(fs_info, cache); 8310 ret = btrfs_add_block_group_cache(fs_info, cache);
@@ -10569,9 +8319,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
10569 * the rbtree, update the space info's counters. 8319 * the rbtree, update the space info's counters.
10570 */ 8320 */
10571 trace_btrfs_add_block_group(fs_info, cache, 1); 8321 trace_btrfs_add_block_group(fs_info, cache, 1);
10572 update_space_info(fs_info, cache->flags, size, bytes_used, 8322 btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
10573 cache->bytes_super, &cache->space_info); 8323 cache->bytes_super, &cache->space_info);
10574 update_global_block_rsv(fs_info); 8324 btrfs_update_global_block_rsv(fs_info);
10575 8325
10576 link_block_group(cache); 8326 link_block_group(cache);
10577 8327
@@ -10598,6 +8348,35 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10598 write_sequnlock(&fs_info->profiles_lock); 8348 write_sequnlock(&fs_info->profiles_lock);
10599} 8349}
10600 8350
8351/*
8352 * Clear incompat bits for the following feature(s):
8353 *
8354 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
8355 * in the whole filesystem
8356 */
8357static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
8358{
8359 if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
8360 struct list_head *head = &fs_info->space_info;
8361 struct btrfs_space_info *sinfo;
8362
8363 list_for_each_entry_rcu(sinfo, head, list) {
8364 bool found = false;
8365
8366 down_read(&sinfo->groups_sem);
8367 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
8368 found = true;
8369 if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
8370 found = true;
8371 up_read(&sinfo->groups_sem);
8372
8373 if (found)
8374 return;
8375 }
8376 btrfs_clear_fs_incompat(fs_info, RAID56);
8377 }
8378}
8379
10601int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 8380int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10602 u64 group_start, struct extent_map *em) 8381 u64 group_start, struct extent_map *em)
10603{ 8382{
@@ -10744,6 +8523,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10744 clear_avail_alloc_bits(fs_info, block_group->flags); 8523 clear_avail_alloc_bits(fs_info, block_group->flags);
10745 } 8524 }
10746 up_write(&block_group->space_info->groups_sem); 8525 up_write(&block_group->space_info->groups_sem);
8526 clear_incompat_bg_bits(fs_info, block_group->flags);
10747 if (kobj) { 8527 if (kobj) {
10748 kobject_del(kobj); 8528 kobject_del(kobj);
10749 kobject_put(kobj); 8529 kobject_put(kobj);
@@ -10853,7 +8633,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10853 if (remove_em) { 8633 if (remove_em) {
10854 struct extent_map_tree *em_tree; 8634 struct extent_map_tree *em_tree;
10855 8635
10856 em_tree = &fs_info->mapping_tree.map_tree; 8636 em_tree = &fs_info->mapping_tree;
10857 write_lock(&em_tree->lock); 8637 write_lock(&em_tree->lock);
10858 remove_extent_mapping(em_tree, em); 8638 remove_extent_mapping(em_tree, em);
10859 write_unlock(&em_tree->lock); 8639 write_unlock(&em_tree->lock);
@@ -10871,7 +8651,7 @@ struct btrfs_trans_handle *
10871btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, 8651btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10872 const u64 chunk_offset) 8652 const u64 chunk_offset)
10873{ 8653{
10874 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 8654 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
10875 struct extent_map *em; 8655 struct extent_map *em;
10876 struct map_lookup *map; 8656 struct map_lookup *map;
10877 unsigned int num_items; 8657 unsigned int num_items;
@@ -11020,7 +8800,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
11020 spin_lock(&space_info->lock); 8800 spin_lock(&space_info->lock);
11021 spin_lock(&block_group->lock); 8801 spin_lock(&block_group->lock);
11022 8802
11023 update_bytes_pinned(space_info, -block_group->pinned); 8803 btrfs_space_info_update_bytes_pinned(fs_info, space_info,
8804 -block_group->pinned);
11024 space_info->bytes_readonly += block_group->pinned; 8805 space_info->bytes_readonly += block_group->pinned;
11025 percpu_counter_add_batch(&space_info->total_bytes_pinned, 8806 percpu_counter_add_batch(&space_info->total_bytes_pinned,
11026 -block_group->pinned, 8807 -block_group->pinned,
@@ -11076,43 +8857,6 @@ next:
11076 spin_unlock(&fs_info->unused_bgs_lock); 8857 spin_unlock(&fs_info->unused_bgs_lock);
11077} 8858}
11078 8859
11079int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
11080{
11081 struct btrfs_super_block *disk_super;
11082 u64 features;
11083 u64 flags;
11084 int mixed = 0;
11085 int ret;
11086
11087 disk_super = fs_info->super_copy;
11088 if (!btrfs_super_root(disk_super))
11089 return -EINVAL;
11090
11091 features = btrfs_super_incompat_flags(disk_super);
11092 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
11093 mixed = 1;
11094
11095 flags = BTRFS_BLOCK_GROUP_SYSTEM;
11096 ret = create_space_info(fs_info, flags);
11097 if (ret)
11098 goto out;
11099
11100 if (mixed) {
11101 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
11102 ret = create_space_info(fs_info, flags);
11103 } else {
11104 flags = BTRFS_BLOCK_GROUP_METADATA;
11105 ret = create_space_info(fs_info, flags);
11106 if (ret)
11107 goto out;
11108
11109 flags = BTRFS_BLOCK_GROUP_DATA;
11110 ret = create_space_info(fs_info, flags);
11111 }
11112out:
11113 return ret;
11114}
11115
11116int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, 8860int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
11117 u64 start, u64 end) 8861 u64 start, u64 end)
11118{ 8862{
@@ -11171,12 +8915,17 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
11171 find_first_clear_extent_bit(&device->alloc_state, start, 8915 find_first_clear_extent_bit(&device->alloc_state, start,
11172 &start, &end, 8916 &start, &end,
11173 CHUNK_TRIMMED | CHUNK_ALLOCATED); 8917 CHUNK_TRIMMED | CHUNK_ALLOCATED);
8918
8919 /* Ensure we skip the reserved area in the first 1M */
8920 start = max_t(u64, start, SZ_1M);
8921
11174 /* 8922 /*
11175 * If find_first_clear_extent_bit find a range that spans the 8923 * If find_first_clear_extent_bit find a range that spans the
11176 * end of the device it will set end to -1, in this case it's up 8924 * end of the device it will set end to -1, in this case it's up
11177 * to the caller to trim the value to the size of the device. 8925 * to the caller to trim the value to the size of the device.
11178 */ 8926 */
11179 end = min(end, device->total_bytes - 1); 8927 end = min(end, device->total_bytes - 1);
8928
11180 len = end - start + 1; 8929 len = end - start + 1;
11181 8930
11182 /* We didn't find any extents */ 8931 /* We didn't find any extents */
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5106008f5e28..1ff438fd5bc2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -359,6 +359,24 @@ do_insert:
359 return NULL; 359 return NULL;
360} 360}
361 361
362/**
363 * __etree_search - search @tree for an entry that contains @offset. Such
364 * an entry would have entry->start <= offset && entry->end >= offset.
365 *
366 * @tree - the tree to search
367 * @offset - offset that should fall within an entry in @tree
368 * @next_ret - pointer to the first entry whose range ends after @offset
369 * @prev_ret - pointer to the first entry whose range begins before @offset
370 * @p_ret - pointer where new node should be anchored (used when inserting an
371 * entry in the tree)
372 * @parent_ret - points to the entry that would have been the parent of the
373 * entry containing @offset
374 *
375 * This function returns a pointer to the entry that contains @offset byte
376 * address. If no such entry exists, then NULL is returned and the other
377 * pointer arguments to the function are filled, otherwise the found entry is
378 * returned and other pointers are left untouched.
379 */
362static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 380static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
363 struct rb_node **next_ret, 381 struct rb_node **next_ret,
364 struct rb_node **prev_ret, 382 struct rb_node **prev_ret,
@@ -504,9 +522,11 @@ static int insert_state(struct extent_io_tree *tree,
504{ 522{
505 struct rb_node *node; 523 struct rb_node *node;
506 524
507 if (end < start) 525 if (end < start) {
508 WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", 526 btrfs_err(tree->fs_info,
509 end, start); 527 "insert state: end < start %llu %llu", end, start);
528 WARN_ON(1);
529 }
510 state->start = start; 530 state->start = start;
511 state->end = end; 531 state->end = end;
512 532
@@ -516,7 +536,8 @@ static int insert_state(struct extent_io_tree *tree,
516 if (node) { 536 if (node) {
517 struct extent_state *found; 537 struct extent_state *found;
518 found = rb_entry(node, struct extent_state, rb_node); 538 found = rb_entry(node, struct extent_state, rb_node);
519 pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n", 539 btrfs_err(tree->fs_info,
540 "found node %llu %llu on insert of %llu %llu",
520 found->start, found->end, start, end); 541 found->start, found->end, start, end);
521 return -EEXIST; 542 return -EEXIST;
522 } 543 }
@@ -1537,8 +1558,8 @@ out:
1537} 1558}
1538 1559
1539/** 1560/**
1540 * find_first_clear_extent_bit - finds the first range that has @bits not set 1561 * find_first_clear_extent_bit - find the first range that has @bits not set.
1541 * and that starts after @start 1562 * This range could start before @start.
1542 * 1563 *
1543 * @tree - the tree to search 1564 * @tree - the tree to search
1544 * @start - the offset at/after which the found extent should start 1565 * @start - the offset at/after which the found extent should start
@@ -1578,12 +1599,52 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
1578 goto out; 1599 goto out;
1579 } 1600 }
1580 } 1601 }
1602 /*
1603 * At this point 'node' either contains 'start' or start is
1604 * before 'node'
1605 */
1581 state = rb_entry(node, struct extent_state, rb_node); 1606 state = rb_entry(node, struct extent_state, rb_node);
1582 if (in_range(start, state->start, state->end - state->start + 1) && 1607
1583 (state->state & bits)) { 1608 if (in_range(start, state->start, state->end - state->start + 1)) {
1584 start = state->end + 1; 1609 if (state->state & bits) {
1610 /*
1611 * |--range with bits sets--|
1612 * |
1613 * start
1614 */
1615 start = state->end + 1;
1616 } else {
1617 /*
1618 * 'start' falls within a range that doesn't
1619 * have the bits set, so take its start as
1620 * the beginning of the desired range
1621 *
1622 * |--range with bits cleared----|
1623 * |
1624 * start
1625 */
1626 *start_ret = state->start;
1627 break;
1628 }
1585 } else { 1629 } else {
1586 *start_ret = start; 1630 /*
1631 * |---prev range---|---hole/unset---|---node range---|
1632 * |
1633 * start
1634 *
1635 * or
1636 *
1637 * |---hole/unset--||--first node--|
1638 * 0 |
1639 * start
1640 */
1641 if (prev) {
1642 state = rb_entry(prev, struct extent_state,
1643 rb_node);
1644 *start_ret = state->end + 1;
1645 } else {
1646 *start_ret = 0;
1647 }
1587 break; 1648 break;
1588 } 1649 }
1589 } 1650 }
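A userspace toy model of the search semantics the comments above describe for find_first_clear_extent_bit(): the returned clear range must end at or after @start but may begin before it. The interval list and values are invented, the set ranges are assumed non-adjacent, and this is not the rb-tree based kernel implementation.

#include <stdio.h>

/* Toy model: intervals where the bit is set, sorted and non-overlapping. */
struct range { unsigned long long start, end; };

int main(void)
{
        struct range set[] = { { 0, 99 }, { 200, 299 }, { 500, 599 } };
        int nr = 3;
        unsigned long long start = 250; /* falls inside the second set range */
        unsigned long long start_ret = 0, end_ret = ~0ULL;
        unsigned long long prev_end_plus1 = 0;
        int i;

        for (i = 0; i < nr; i++) {
                if (start >= set[i].start && start <= set[i].end) {
                        /* inside a set range: clear range begins right after it */
                        start_ret = set[i].end + 1;
                        prev_end_plus1 = start_ret;
                } else if (start < set[i].start) {
                        /* start (or the skipped-to point) sits in a hole that
                         * began after the previous set range */
                        start_ret = prev_end_plus1;
                        end_ret = set[i].start - 1;
                        break;
                } else {
                        prev_end_plus1 = set[i].end + 1;
                }
        }
        /* here: [300, 499] -- starts before nothing, ends after start */
        printf("clear range: [%llu, %llu]\n", start_ret, end_ret);
        return 0;
}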
@@ -1719,10 +1780,10 @@ static noinline int lock_delalloc_pages(struct inode *inode,
1719 */ 1780 */
1720EXPORT_FOR_TESTS 1781EXPORT_FOR_TESTS
1721noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, 1782noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
1722 struct extent_io_tree *tree,
1723 struct page *locked_page, u64 *start, 1783 struct page *locked_page, u64 *start,
1724 u64 *end) 1784 u64 *end)
1725{ 1785{
1786 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1726 u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; 1787 u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
1727 u64 delalloc_start; 1788 u64 delalloc_start;
1728 u64 delalloc_end; 1789 u64 delalloc_end;
@@ -2800,12 +2861,11 @@ static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
2800 * never fail. We're returning a bio right now but you can call btrfs_io_bio 2861 * never fail. We're returning a bio right now but you can call btrfs_io_bio
2801 * for the appropriate container_of magic 2862 * for the appropriate container_of magic
2802 */ 2863 */
2803struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) 2864struct bio *btrfs_bio_alloc(u64 first_byte)
2804{ 2865{
2805 struct bio *bio; 2866 struct bio *bio;
2806 2867
2807 bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); 2868 bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
2808 bio_set_dev(bio, bdev);
2809 bio->bi_iter.bi_sector = first_byte >> 9; 2869 bio->bi_iter.bi_sector = first_byte >> 9;
2810 btrfs_io_bio_init(btrfs_io_bio(bio)); 2870 btrfs_io_bio_init(btrfs_io_bio(bio));
2811 return bio; 2871 return bio;
@@ -2916,7 +2976,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
2916 } 2976 }
2917 } 2977 }
2918 2978
2919 bio = btrfs_bio_alloc(bdev, offset); 2979 bio = btrfs_bio_alloc(offset);
2980 bio_set_dev(bio, bdev);
2920 bio_add_page(bio, page, page_size, pg_offset); 2981 bio_add_page(bio, page, page_size, pg_offset);
2921 bio->bi_end_io = end_io_func; 2982 bio->bi_end_io = end_io_func;
2922 bio->bi_private = tree; 2983 bio->bi_private = tree;
@@ -3204,21 +3265,10 @@ static inline void contiguous_readpages(struct extent_io_tree *tree,
3204 unsigned long *bio_flags, 3265 unsigned long *bio_flags,
3205 u64 *prev_em_start) 3266 u64 *prev_em_start)
3206{ 3267{
3207 struct inode *inode; 3268 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
3208 struct btrfs_ordered_extent *ordered;
3209 int index; 3269 int index;
3210 3270
3211 inode = pages[0]->mapping->host; 3271 btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
3212 while (1) {
3213 lock_extent(tree, start, end);
3214 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
3215 end - start + 1);
3216 if (!ordered)
3217 break;
3218 unlock_extent(tree, start, end);
3219 btrfs_start_ordered_extent(inode, ordered, 1);
3220 btrfs_put_ordered_extent(ordered);
3221 }
3222 3272
3223 for (index = 0; index < nr_pages; index++) { 3273 for (index = 0; index < nr_pages; index++) {
3224 __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, 3274 __do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
@@ -3234,22 +3284,12 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
3234 unsigned long *bio_flags, 3284 unsigned long *bio_flags,
3235 unsigned int read_flags) 3285 unsigned int read_flags)
3236{ 3286{
3237 struct inode *inode = page->mapping->host; 3287 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3238 struct btrfs_ordered_extent *ordered;
3239 u64 start = page_offset(page); 3288 u64 start = page_offset(page);
3240 u64 end = start + PAGE_SIZE - 1; 3289 u64 end = start + PAGE_SIZE - 1;
3241 int ret; 3290 int ret;
3242 3291
3243 while (1) { 3292 btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
3244 lock_extent(tree, start, end);
3245 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
3246 PAGE_SIZE);
3247 if (!ordered)
3248 break;
3249 unlock_extent(tree, start, end);
3250 btrfs_start_ordered_extent(inode, ordered, 1);
3251 btrfs_put_ordered_extent(ordered);
3252 }
3253 3293
3254 ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, 3294 ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
3255 bio_flags, read_flags, NULL); 3295 bio_flags, read_flags, NULL);
@@ -3290,7 +3330,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
3290 struct page *page, struct writeback_control *wbc, 3330 struct page *page, struct writeback_control *wbc,
3291 u64 delalloc_start, unsigned long *nr_written) 3331 u64 delalloc_start, unsigned long *nr_written)
3292{ 3332{
3293 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
3294 u64 page_end = delalloc_start + PAGE_SIZE - 1; 3333 u64 page_end = delalloc_start + PAGE_SIZE - 1;
3295 bool found; 3334 bool found;
3296 u64 delalloc_to_write = 0; 3335 u64 delalloc_to_write = 0;
@@ -3300,8 +3339,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
3300 3339
3301 3340
3302 while (delalloc_end < page_end) { 3341 while (delalloc_end < page_end) {
3303 found = find_lock_delalloc_range(inode, tree, 3342 found = find_lock_delalloc_range(inode, page,
3304 page,
3305 &delalloc_start, 3343 &delalloc_start,
3306 &delalloc_end); 3344 &delalloc_end);
3307 if (!found) { 3345 if (!found) {
@@ -3310,7 +3348,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
3310 } 3348 }
3311 ret = btrfs_run_delalloc_range(inode, page, delalloc_start, 3349 ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
3312 delalloc_end, &page_started, nr_written, wbc); 3350 delalloc_end, &page_started, nr_written, wbc);
3313 /* File system has been set read-only */
3314 if (ret) { 3351 if (ret) {
3315 SetPageError(page); 3352 SetPageError(page);
3316 /* 3353 /*
@@ -4542,6 +4579,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4542 struct btrfs_path *path; 4579 struct btrfs_path *path;
4543 struct btrfs_root *root = BTRFS_I(inode)->root; 4580 struct btrfs_root *root = BTRFS_I(inode)->root;
4544 struct fiemap_cache cache = { 0 }; 4581 struct fiemap_cache cache = { 0 };
4582 struct ulist *roots;
4583 struct ulist *tmp_ulist;
4545 int end = 0; 4584 int end = 0;
4546 u64 em_start = 0; 4585 u64 em_start = 0;
4547 u64 em_len = 0; 4586 u64 em_len = 0;
@@ -4555,6 +4594,13 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4555 return -ENOMEM; 4594 return -ENOMEM;
4556 path->leave_spinning = 1; 4595 path->leave_spinning = 1;
4557 4596
4597 roots = ulist_alloc(GFP_KERNEL);
4598 tmp_ulist = ulist_alloc(GFP_KERNEL);
4599 if (!roots || !tmp_ulist) {
4600 ret = -ENOMEM;
4601 goto out_free_ulist;
4602 }
4603
4558 start = round_down(start, btrfs_inode_sectorsize(inode)); 4604 start = round_down(start, btrfs_inode_sectorsize(inode));
4559 len = round_up(max, btrfs_inode_sectorsize(inode)) - start; 4605 len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
4560 4606
@@ -4565,8 +4611,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4565 ret = btrfs_lookup_file_extent(NULL, root, path, 4611 ret = btrfs_lookup_file_extent(NULL, root, path,
4566 btrfs_ino(BTRFS_I(inode)), -1, 0); 4612 btrfs_ino(BTRFS_I(inode)), -1, 0);
4567 if (ret < 0) { 4613 if (ret < 0) {
4568 btrfs_free_path(path); 4614 goto out_free_ulist;
4569 return ret;
4570 } else { 4615 } else {
4571 WARN_ON(!ret); 4616 WARN_ON(!ret);
4572 if (ret == 1) 4617 if (ret == 1)
@@ -4675,7 +4720,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4675 */ 4720 */
4676 ret = btrfs_check_shared(root, 4721 ret = btrfs_check_shared(root,
4677 btrfs_ino(BTRFS_I(inode)), 4722 btrfs_ino(BTRFS_I(inode)),
4678 bytenr); 4723 bytenr, roots, tmp_ulist);
4679 if (ret < 0) 4724 if (ret < 0)
4680 goto out_free; 4725 goto out_free;
4681 if (ret) 4726 if (ret)
@@ -4718,9 +4763,13 @@ out_free:
4718 ret = emit_last_fiemap_cache(fieinfo, &cache); 4763 ret = emit_last_fiemap_cache(fieinfo, &cache);
4719 free_extent_map(em); 4764 free_extent_map(em);
4720out: 4765out:
4721 btrfs_free_path(path);
4722 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4766 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
4723 &cached_state); 4767 &cached_state);
4768
4769out_free_ulist:
4770 btrfs_free_path(path);
4771 ulist_free(roots);
4772 ulist_free(tmp_ulist);
4724 return ret; 4773 return ret;
4725} 4774}
4726 4775
@@ -4808,7 +4857,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
4808 eb->bflags = 0; 4857 eb->bflags = 0;
4809 rwlock_init(&eb->lock); 4858 rwlock_init(&eb->lock);
4810 atomic_set(&eb->blocking_readers, 0); 4859 atomic_set(&eb->blocking_readers, 0);
4811 atomic_set(&eb->blocking_writers, 0); 4860 eb->blocking_writers = 0;
4812 eb->lock_nested = false; 4861 eb->lock_nested = false;
4813 init_waitqueue_head(&eb->write_lock_wq); 4862 init_waitqueue_head(&eb->write_lock_wq);
4814 init_waitqueue_head(&eb->read_lock_wq); 4863 init_waitqueue_head(&eb->read_lock_wq);
@@ -4827,10 +4876,10 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
4827 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); 4876 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
4828 4877
4829#ifdef CONFIG_BTRFS_DEBUG 4878#ifdef CONFIG_BTRFS_DEBUG
4830 atomic_set(&eb->spinning_writers, 0); 4879 eb->spinning_writers = 0;
4831 atomic_set(&eb->spinning_readers, 0); 4880 atomic_set(&eb->spinning_readers, 0);
4832 atomic_set(&eb->read_locks, 0); 4881 atomic_set(&eb->read_locks, 0);
4833 atomic_set(&eb->write_locks, 0); 4882 eb->write_locks = 0;
4834#endif 4883#endif
4835 4884
4836 return eb; 4885 return eb;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index aa18a16a6ed7..401423b16976 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -167,7 +167,7 @@ struct extent_buffer {
167 struct rcu_head rcu_head; 167 struct rcu_head rcu_head;
168 pid_t lock_owner; 168 pid_t lock_owner;
169 169
170 atomic_t blocking_writers; 170 int blocking_writers;
171 atomic_t blocking_readers; 171 atomic_t blocking_readers;
172 bool lock_nested; 172 bool lock_nested;
173 /* >= 0 if eb belongs to a log tree, -1 otherwise */ 173 /* >= 0 if eb belongs to a log tree, -1 otherwise */
@@ -187,10 +187,10 @@ struct extent_buffer {
187 wait_queue_head_t read_lock_wq; 187 wait_queue_head_t read_lock_wq;
188 struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; 188 struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
189#ifdef CONFIG_BTRFS_DEBUG 189#ifdef CONFIG_BTRFS_DEBUG
190 atomic_t spinning_writers; 190 int spinning_writers;
191 atomic_t spinning_readers; 191 atomic_t spinning_readers;
192 atomic_t read_locks; 192 atomic_t read_locks;
193 atomic_t write_locks; 193 int write_locks;
194 struct list_head leak_list; 194 struct list_head leak_list;
195#endif 195#endif
196}; 196};
@@ -497,7 +497,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
497 u64 delalloc_end, struct page *locked_page, 497 u64 delalloc_end, struct page *locked_page,
498 unsigned bits_to_clear, 498 unsigned bits_to_clear,
499 unsigned long page_ops); 499 unsigned long page_ops);
500struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte); 500struct bio *btrfs_bio_alloc(u64 first_byte);
501struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); 501struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
502struct bio *btrfs_bio_clone(struct bio *bio); 502struct bio *btrfs_bio_clone(struct bio *bio);
503struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); 503struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
@@ -549,7 +549,7 @@ int free_io_failure(struct extent_io_tree *failure_tree,
549 struct extent_io_tree *io_tree, 549 struct extent_io_tree *io_tree,
550 struct io_failure_record *rec); 550 struct io_failure_record *rec);
551#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 551#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
552bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, 552bool find_lock_delalloc_range(struct inode *inode,
553 struct page *locked_page, u64 *start, 553 struct page *locked_page, u64 *start,
554 u64 *end); 554 u64 *end);
555#endif 555#endif
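The prototype change above matches the extent_io.c hunks: btrfs_bio_alloc() no longer takes a block device, so callers attach it themselves. A minimal caller sketch using only helpers visible in this diff (example_alloc_read_bio() and its arguments are placeholders, not part of the patch):

static struct bio *example_alloc_read_bio(struct block_device *bdev,
					   struct page *page, u64 disk_byte)
{
	struct bio *bio;

	bio = btrfs_bio_alloc(disk_byte);	/* device no longer passed in */
	bio_set_dev(bio, bdev);			/* caller sets it explicitly  */
	bio_add_page(bio, page, PAGE_SIZE, 0);
	return bio;
}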
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d431ea8198e4..1a599f50837b 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -8,6 +8,7 @@
8#include <linux/pagemap.h> 8#include <linux/pagemap.h>
9#include <linux/highmem.h> 9#include <linux/highmem.h>
10#include <linux/sched/mm.h> 10#include <linux/sched/mm.h>
11#include <crypto/hash.h>
11#include "ctree.h" 12#include "ctree.h"
12#include "disk-io.h" 13#include "disk-io.h"
13#include "transaction.h" 14#include "transaction.h"
@@ -22,9 +23,13 @@
22#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ 23#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
23 PAGE_SIZE)) 24 PAGE_SIZE))
24 25
25#define MAX_ORDERED_SUM_BYTES(fs_info) ((PAGE_SIZE - \ 26static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
26 sizeof(struct btrfs_ordered_sum)) / \ 27 u16 csum_size)
27 sizeof(u32) * (fs_info)->sectorsize) 28{
29 u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size;
30
31 return ncsums * fs_info->sectorsize;
32}
28 33
29int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 34int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
30 struct btrfs_root *root, 35 struct btrfs_root *root,
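The old MAX_ORDERED_SUM_BYTES macro hard-coded 4-byte (crc32c) checksums; the helper above derives the limit from the actual checksum size. A stand-alone arithmetic sketch of the effect (the ordered_sum header overhead is an assumed placeholder, not the real sizeof(struct btrfs_ordered_sum)):

#include <stdio.h>

int main(void)
{
	const unsigned int page_size = 4096;
	const unsigned int sectorsize = 4096;
	const unsigned int hdr = 48;			/* assumed overhead */
	const unsigned int csum_sizes[] = { 4, 32 };	/* crc32c vs a 32-byte csum */

	for (int i = 0; i < 2; i++) {
		unsigned int ncsums = (page_size - hdr) / csum_sizes[i];

		/* bigger checksums -> fewer sums per page -> each ordered
		 * sum item covers a smaller byte range */
		printf("csum_size %2u -> %4u csums -> covers %8u bytes\n",
		       csum_sizes[i], ncsums, ncsums * sectorsize);
	}
	return 0;
}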
@@ -144,7 +149,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
144} 149}
145 150
146static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, 151static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
147 u64 logical_offset, u32 *dst, int dio) 152 u64 logical_offset, u8 *dst, int dio)
148{ 153{
149 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 154 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
150 struct bio_vec bvec; 155 struct bio_vec bvec;
@@ -182,7 +187,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
182 } 187 }
183 csum = btrfs_bio->csum; 188 csum = btrfs_bio->csum;
184 } else { 189 } else {
185 csum = (u8 *)dst; 190 csum = dst;
186 } 191 }
187 192
188 if (bio->bi_iter.bi_size > PAGE_SIZE * 8) 193 if (bio->bi_iter.bi_size > PAGE_SIZE * 8)
@@ -211,7 +216,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
211 if (!dio) 216 if (!dio)
212 offset = page_offset(bvec.bv_page) + bvec.bv_offset; 217 offset = page_offset(bvec.bv_page) + bvec.bv_offset;
213 count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, 218 count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
214 (u32 *)csum, nblocks); 219 csum, nblocks);
215 if (count) 220 if (count)
216 goto found; 221 goto found;
217 222
@@ -283,7 +288,8 @@ next:
283 return 0; 288 return 0;
284} 289}
285 290
286blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) 291blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
292 u8 *dst)
287{ 293{
288 return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); 294 return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0);
289} 295}
@@ -374,7 +380,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
374 struct btrfs_csum_item); 380 struct btrfs_csum_item);
375 while (start < csum_end) { 381 while (start < csum_end) {
376 size = min_t(size_t, csum_end - start, 382 size = min_t(size_t, csum_end - start,
377 MAX_ORDERED_SUM_BYTES(fs_info)); 383 max_ordered_sum_bytes(fs_info, csum_size));
378 sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), 384 sums = kzalloc(btrfs_ordered_sum_size(fs_info, size),
379 GFP_NOFS); 385 GFP_NOFS);
380 if (!sums) { 386 if (!sums) {
@@ -427,6 +433,7 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
427 u64 file_start, int contig) 433 u64 file_start, int contig)
428{ 434{
429 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 435 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
436 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
430 struct btrfs_ordered_sum *sums; 437 struct btrfs_ordered_sum *sums;
431 struct btrfs_ordered_extent *ordered = NULL; 438 struct btrfs_ordered_extent *ordered = NULL;
432 char *data; 439 char *data;
@@ -439,6 +446,7 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
439 int i; 446 int i;
440 u64 offset; 447 u64 offset;
441 unsigned nofs_flag; 448 unsigned nofs_flag;
449 const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
442 450
443 nofs_flag = memalloc_nofs_save(); 451 nofs_flag = memalloc_nofs_save();
444 sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), 452 sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
@@ -459,6 +467,8 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
459 sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; 467 sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
460 index = 0; 468 index = 0;
461 469
470 shash->tfm = fs_info->csum_shash;
471
462 bio_for_each_segment(bvec, bio, iter) { 472 bio_for_each_segment(bvec, bio, iter) {
463 if (!contig) 473 if (!contig)
464 offset = page_offset(bvec.bv_page) + bvec.bv_offset; 474 offset = page_offset(bvec.bv_page) + bvec.bv_offset;
@@ -498,17 +508,14 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
498 index = 0; 508 index = 0;
499 } 509 }
500 510
501 sums->sums[index] = ~(u32)0; 511 crypto_shash_init(shash);
502 data = kmap_atomic(bvec.bv_page); 512 data = kmap_atomic(bvec.bv_page);
503 sums->sums[index] 513 crypto_shash_update(shash, data + bvec.bv_offset
504 = btrfs_csum_data(data + bvec.bv_offset 514 + (i * fs_info->sectorsize),
505 + (i * fs_info->sectorsize), 515 fs_info->sectorsize);
506 sums->sums[index],
507 fs_info->sectorsize);
508 kunmap_atomic(data); 516 kunmap_atomic(data);
509 btrfs_csum_final(sums->sums[index], 517 crypto_shash_final(shash, (char *)(sums->sums + index));
510 (char *)(sums->sums + index)); 518 index += csum_size;
511 index++;
512 offset += fs_info->sectorsize; 519 offset += fs_info->sectorsize;
513 this_sum_bytes += fs_info->sectorsize; 520 this_sum_bytes += fs_info->sectorsize;
514 total_bytes += fs_info->sectorsize; 521 total_bytes += fs_info->sectorsize;
@@ -904,9 +911,9 @@ found:
904 write_extent_buffer(leaf, sums->sums + index, (unsigned long)item, 911 write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
905 ins_size); 912 ins_size);
906 913
914 index += ins_size;
907 ins_size /= csum_size; 915 ins_size /= csum_size;
908 total_bytes += ins_size * fs_info->sectorsize; 916 total_bytes += ins_size * fs_info->sectorsize;
909 index += ins_size;
910 917
911 btrfs_mark_buffer_dirty(path->nodes[0]); 918 btrfs_mark_buffer_dirty(path->nodes[0]);
912 if (total_bytes < sums->len) { 919 if (total_bytes < sums->len) {
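Checksums in this file are now computed through the generic crypto API instead of the btrfs_csum_data()/btrfs_csum_final() pair. A condensed sketch of the init/update/final pattern used by btrfs_csum_one_bio() above (example_csum_buffer() is illustrative only; fs_info->csum_shash is the transform set up elsewhere in this series):

static void example_csum_buffer(struct btrfs_fs_info *fs_info,
				const void *data, unsigned int len, u8 *out)
{
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);
	crypto_shash_update(shash, data, len);
	crypto_shash_final(shash, out);	/* out must hold csum_size bytes */
}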
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 89f5be2bfb43..58a18ed11546 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -26,6 +26,7 @@
26#include "volumes.h" 26#include "volumes.h"
27#include "qgroup.h" 27#include "qgroup.h"
28#include "compression.h" 28#include "compression.h"
29#include "delalloc-space.h"
29 30
30static struct kmem_cache *btrfs_inode_defrag_cachep; 31static struct kmem_cache *btrfs_inode_defrag_cachep;
31/* 32/*
@@ -1550,30 +1551,20 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
1550{ 1551{
1551 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1552 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1552 struct btrfs_root *root = inode->root; 1553 struct btrfs_root *root = inode->root;
1553 struct btrfs_ordered_extent *ordered;
1554 u64 lockstart, lockend; 1554 u64 lockstart, lockend;
1555 u64 num_bytes; 1555 u64 num_bytes;
1556 int ret; 1556 int ret;
1557 1557
1558 ret = btrfs_start_write_no_snapshotting(root); 1558 ret = btrfs_start_write_no_snapshotting(root);
1559 if (!ret) 1559 if (!ret)
1560 return -ENOSPC; 1560 return -EAGAIN;
1561 1561
1562 lockstart = round_down(pos, fs_info->sectorsize); 1562 lockstart = round_down(pos, fs_info->sectorsize);
1563 lockend = round_up(pos + *write_bytes, 1563 lockend = round_up(pos + *write_bytes,
1564 fs_info->sectorsize) - 1; 1564 fs_info->sectorsize) - 1;
1565 1565
1566 while (1) { 1566 btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart,
1567 lock_extent(&inode->io_tree, lockstart, lockend); 1567 lockend, NULL);
1568 ordered = btrfs_lookup_ordered_range(inode, lockstart,
1569 lockend - lockstart + 1);
1570 if (!ordered) {
1571 break;
1572 }
1573 unlock_extent(&inode->io_tree, lockstart, lockend);
1574 btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
1575 btrfs_put_ordered_extent(ordered);
1576 }
1577 1568
1578 num_bytes = lockend - lockstart + 1; 1569 num_bytes = lockend - lockstart + 1;
1579 ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, 1570 ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
@@ -2721,6 +2712,11 @@ out_only_mutex:
2721 * for detecting, at fsync time, if the inode isn't yet in the 2712 * for detecting, at fsync time, if the inode isn't yet in the
2722 * log tree or it's there but not up to date. 2713 * log tree or it's there but not up to date.
2723 */ 2714 */
2715 struct timespec64 now = current_time(inode);
2716
2717 inode_inc_iversion(inode);
2718 inode->i_mtime = now;
2719 inode->i_ctime = now;
2724 trans = btrfs_start_transaction(root, 1); 2720 trans = btrfs_start_transaction(root, 1);
2725 if (IS_ERR(trans)) { 2721 if (IS_ERR(trans)) {
2726 err = PTR_ERR(trans); 2722 err = PTR_ERR(trans);
@@ -2801,9 +2797,9 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
2801} 2797}
2802 2798
2803enum { 2799enum {
2804 RANGE_BOUNDARY_WRITTEN_EXTENT = 0, 2800 RANGE_BOUNDARY_WRITTEN_EXTENT,
2805 RANGE_BOUNDARY_PREALLOC_EXTENT = 1, 2801 RANGE_BOUNDARY_PREALLOC_EXTENT,
2806 RANGE_BOUNDARY_HOLE = 2, 2802 RANGE_BOUNDARY_HOLE,
2807}; 2803};
2808 2804
2809static int btrfs_zero_range_check_range_boundary(struct inode *inode, 2805static int btrfs_zero_range_check_range_boundary(struct inode *inode,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f74dc259307b..062be9dde4c6 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,8 @@
18#include "extent_io.h" 18#include "extent_io.h"
19#include "inode-map.h" 19#include "inode-map.h"
20#include "volumes.h" 20#include "volumes.h"
21#include "space-info.h"
22#include "delalloc-space.h"
21 23
22#define BITS_PER_BITMAP (PAGE_SIZE * 8UL) 24#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
23#define MAX_CACHE_BYTES_PER_GIG SZ_32K 25#define MAX_CACHE_BYTES_PER_GIG SZ_32K
@@ -465,9 +467,8 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
465 if (index == 0) 467 if (index == 0)
466 offset = sizeof(u32) * io_ctl->num_pages; 468 offset = sizeof(u32) * io_ctl->num_pages;
467 469
468 crc = btrfs_csum_data(io_ctl->orig + offset, crc, 470 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
469 PAGE_SIZE - offset); 471 btrfs_crc32c_final(crc, (u8 *)&crc);
470 btrfs_csum_final(crc, (u8 *)&crc);
471 io_ctl_unmap_page(io_ctl); 472 io_ctl_unmap_page(io_ctl);
472 tmp = page_address(io_ctl->pages[0]); 473 tmp = page_address(io_ctl->pages[0]);
473 tmp += index; 474 tmp += index;
@@ -493,9 +494,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
493 val = *tmp; 494 val = *tmp;
494 495
495 io_ctl_map_page(io_ctl, 0); 496 io_ctl_map_page(io_ctl, 0);
496 crc = btrfs_csum_data(io_ctl->orig + offset, crc, 497 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
497 PAGE_SIZE - offset); 498 btrfs_crc32c_final(crc, (u8 *)&crc);
498 btrfs_csum_final(crc, (u8 *)&crc);
499 if (val != crc) { 499 if (val != crc) {
500 btrfs_err_rl(io_ctl->fs_info, 500 btrfs_err_rl(io_ctl->fs_info,
501 "csum mismatch on free space cache"); 501 "csum mismatch on free space cache");
@@ -3166,8 +3166,8 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
3166 space_info->bytes_readonly += reserved_bytes; 3166 space_info->bytes_readonly += reserved_bytes;
3167 block_group->reserved -= reserved_bytes; 3167 block_group->reserved -= reserved_bytes;
3168 space_info->bytes_reserved -= reserved_bytes; 3168 space_info->bytes_reserved -= reserved_bytes;
3169 spin_unlock(&space_info->lock);
3170 spin_unlock(&block_group->lock); 3169 spin_unlock(&block_group->lock);
3170 spin_unlock(&space_info->lock);
3171 } 3171 }
3172 3172
3173 return ret; 3173 return ret;
@@ -3358,7 +3358,7 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
3358 3358
3359 if (cleanup) { 3359 if (cleanup) {
3360 mutex_lock(&fs_info->chunk_mutex); 3360 mutex_lock(&fs_info->chunk_mutex);
3361 em_tree = &fs_info->mapping_tree.map_tree; 3361 em_tree = &fs_info->mapping_tree;
3362 write_lock(&em_tree->lock); 3362 write_lock(&em_tree->lock);
3363 em = lookup_extent_mapping(em_tree, block_group->key.objectid, 3363 em = lookup_extent_mapping(em_tree, block_group->key.objectid,
3364 1); 3364 1);
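The io_ctl_set_crc()/io_ctl_check_crc() hunks switch from btrfs_csum_data()/btrfs_csum_final() to the crc32c-specific helpers. A small sketch of the resulting two-step pattern (the seed value is an assumption; it is not visible in this hunk):

static u32 example_crc32c(const void *buf, u32 len)
{
	u32 crc = ~(u32)0;			/* assumed seed */

	crc = btrfs_crc32c(crc, buf, len);	/* accumulate over the buffer */
	btrfs_crc32c_final(crc, (u8 *)&crc);	/* finalise in place */
	return crc;
}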
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index ffca2abf13d0..2e8bb402050b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -11,6 +11,7 @@
11#include "free-space-cache.h" 11#include "free-space-cache.h"
12#include "inode-map.h" 12#include "inode-map.h"
13#include "transaction.h" 13#include "transaction.h"
14#include "delalloc-space.h"
14 15
15static int caching_kthread(void *data) 16static int caching_kthread(void *data)
16{ 17{
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a2aabdb85226..1af069a9a0c7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -47,6 +47,7 @@
47#include "props.h" 47#include "props.h"
48#include "qgroup.h" 48#include "qgroup.h"
49#include "dedupe.h" 49#include "dedupe.h"
50#include "delalloc-space.h"
50 51
51struct btrfs_iget_args { 52struct btrfs_iget_args {
52 struct btrfs_key *location; 53 struct btrfs_key *location;
@@ -1932,17 +1933,19 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
1932 u64 length = 0; 1933 u64 length = 0;
1933 u64 map_length; 1934 u64 map_length;
1934 int ret; 1935 int ret;
1936 struct btrfs_io_geometry geom;
1935 1937
1936 if (bio_flags & EXTENT_BIO_COMPRESSED) 1938 if (bio_flags & EXTENT_BIO_COMPRESSED)
1937 return 0; 1939 return 0;
1938 1940
1939 length = bio->bi_iter.bi_size; 1941 length = bio->bi_iter.bi_size;
1940 map_length = length; 1942 map_length = length;
1941 ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, 1943 ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length,
1942 NULL, 0); 1944 &geom);
1943 if (ret < 0) 1945 if (ret < 0)
1944 return ret; 1946 return ret;
1945 if (map_length < length + size) 1947
1948 if (geom.len < length + size)
1946 return 1; 1949 return 1;
1947 return 0; 1950 return 0;
1948} 1951}
@@ -3203,16 +3206,23 @@ static int __readpage_endio_check(struct inode *inode,
3203 int icsum, struct page *page, 3206 int icsum, struct page *page,
3204 int pgoff, u64 start, size_t len) 3207 int pgoff, u64 start, size_t len)
3205{ 3208{
3209 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3210 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3206 char *kaddr; 3211 char *kaddr;
3207 u32 csum_expected; 3212 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
3208 u32 csum = ~(u32)0; 3213 u8 *csum_expected;
3214 u8 csum[BTRFS_CSUM_SIZE];
3209 3215
3210 csum_expected = *(((u32 *)io_bio->csum) + icsum); 3216 csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size;
3211 3217
3212 kaddr = kmap_atomic(page); 3218 kaddr = kmap_atomic(page);
3213 csum = btrfs_csum_data(kaddr + pgoff, csum, len); 3219 shash->tfm = fs_info->csum_shash;
3214 btrfs_csum_final(csum, (u8 *)&csum); 3220
3215 if (csum != csum_expected) 3221 crypto_shash_init(shash);
3222 crypto_shash_update(shash, kaddr + pgoff, len);
3223 crypto_shash_final(shash, csum);
3224
3225 if (memcmp(csum, csum_expected, csum_size))
3216 goto zeroit; 3226 goto zeroit;
3217 3227
3218 kunmap_atomic(kaddr); 3228 kunmap_atomic(kaddr);
@@ -3286,6 +3296,28 @@ void btrfs_add_delayed_iput(struct inode *inode)
3286 wake_up_process(fs_info->cleaner_kthread); 3296 wake_up_process(fs_info->cleaner_kthread);
3287} 3297}
3288 3298
3299static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3300 struct btrfs_inode *inode)
3301{
3302 list_del_init(&inode->delayed_iput);
3303 spin_unlock(&fs_info->delayed_iput_lock);
3304 iput(&inode->vfs_inode);
3305 if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3306 wake_up(&fs_info->delayed_iputs_wait);
3307 spin_lock(&fs_info->delayed_iput_lock);
3308}
3309
3310static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3311 struct btrfs_inode *inode)
3312{
3313 if (!list_empty(&inode->delayed_iput)) {
3314 spin_lock(&fs_info->delayed_iput_lock);
3315 if (!list_empty(&inode->delayed_iput))
3316 run_delayed_iput_locked(fs_info, inode);
3317 spin_unlock(&fs_info->delayed_iput_lock);
3318 }
3319}
3320
3289void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) 3321void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3290{ 3322{
3291 3323
@@ -3295,12 +3327,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3295 3327
3296 inode = list_first_entry(&fs_info->delayed_iputs, 3328 inode = list_first_entry(&fs_info->delayed_iputs,
3297 struct btrfs_inode, delayed_iput); 3329 struct btrfs_inode, delayed_iput);
3298 list_del_init(&inode->delayed_iput); 3330 run_delayed_iput_locked(fs_info, inode);
3299 spin_unlock(&fs_info->delayed_iput_lock);
3300 iput(&inode->vfs_inode);
3301 if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3302 wake_up(&fs_info->delayed_iputs_wait);
3303 spin_lock(&fs_info->delayed_iput_lock);
3304 } 3331 }
3305 spin_unlock(&fs_info->delayed_iput_lock); 3332 spin_unlock(&fs_info->delayed_iput_lock);
3306} 3333}
@@ -3935,9 +3962,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3935 struct btrfs_fs_info *fs_info = root->fs_info; 3962 struct btrfs_fs_info *fs_info = root->fs_info;
3936 struct btrfs_path *path; 3963 struct btrfs_path *path;
3937 int ret = 0; 3964 int ret = 0;
3938 struct extent_buffer *leaf;
3939 struct btrfs_dir_item *di; 3965 struct btrfs_dir_item *di;
3940 struct btrfs_key key;
3941 u64 index; 3966 u64 index;
3942 u64 ino = btrfs_ino(inode); 3967 u64 ino = btrfs_ino(inode);
3943 u64 dir_ino = btrfs_ino(dir); 3968 u64 dir_ino = btrfs_ino(dir);
@@ -3955,8 +3980,6 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3955 ret = di ? PTR_ERR(di) : -ENOENT; 3980 ret = di ? PTR_ERR(di) : -ENOENT;
3956 goto err; 3981 goto err;
3957 } 3982 }
3958 leaf = path->nodes[0];
3959 btrfs_dir_item_key_to_cpu(leaf, di, &key);
3960 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3983 ret = btrfs_delete_one_dir_name(trans, root, path, di);
3961 if (ret) 3984 if (ret)
3962 goto err; 3985 goto err;
@@ -4009,6 +4032,17 @@ skip_backref:
4009 ret = 0; 4032 ret = 0;
4010 else if (ret) 4033 else if (ret)
4011 btrfs_abort_transaction(trans, ret); 4034 btrfs_abort_transaction(trans, ret);
4035
4036 /*
4037 * If we have a pending delayed iput we could end up with the final iput
4038 * being run in btrfs-cleaner context. If we have enough of these built
4039 * up we can end up burning a lot of time in btrfs-cleaner without any
4040 * way to throttle the unlinks. Since we're currently holding a ref on
4041 * the inode we can run the delayed iput here without any issues as the
4042 * final iput won't be done until after we drop the ref we're currently
4043 * holding.
4044 */
4045 btrfs_run_delayed_iput(fs_info, inode);
4012err: 4046err:
4013 btrfs_free_path(path); 4047 btrfs_free_path(path);
4014 if (ret) 4048 if (ret)
@@ -5008,21 +5042,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
5008 if (size <= hole_start) 5042 if (size <= hole_start)
5009 return 0; 5043 return 0;
5010 5044
5011 while (1) { 5045 btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start,
5012 struct btrfs_ordered_extent *ordered; 5046 block_end - 1, &cached_state);
5013
5014 lock_extent_bits(io_tree, hole_start, block_end - 1,
5015 &cached_state);
5016 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start,
5017 block_end - hole_start);
5018 if (!ordered)
5019 break;
5020 unlock_extent_cached(io_tree, hole_start, block_end - 1,
5021 &cached_state);
5022 btrfs_start_ordered_extent(inode, ordered, 1);
5023 btrfs_put_ordered_extent(ordered);
5024 }
5025
5026 cur_offset = hole_start; 5047 cur_offset = hole_start;
5027 while (1) { 5048 while (1) {
5028 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, 5049 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
@@ -8318,22 +8339,21 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
8318 struct bio *orig_bio = dip->orig_bio; 8339 struct bio *orig_bio = dip->orig_bio;
8319 u64 start_sector = orig_bio->bi_iter.bi_sector; 8340 u64 start_sector = orig_bio->bi_iter.bi_sector;
8320 u64 file_offset = dip->logical_offset; 8341 u64 file_offset = dip->logical_offset;
8321 u64 map_length;
8322 int async_submit = 0; 8342 int async_submit = 0;
8323 u64 submit_len; 8343 u64 submit_len;
8324 int clone_offset = 0; 8344 int clone_offset = 0;
8325 int clone_len; 8345 int clone_len;
8326 int ret; 8346 int ret;
8327 blk_status_t status; 8347 blk_status_t status;
8348 struct btrfs_io_geometry geom;
8328 8349
8329 map_length = orig_bio->bi_iter.bi_size; 8350 submit_len = orig_bio->bi_iter.bi_size;
8330 submit_len = map_length; 8351 ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
8331 ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, 8352 start_sector << 9, submit_len, &geom);
8332 &map_length, NULL, 0);
8333 if (ret) 8353 if (ret)
8334 return -EIO; 8354 return -EIO;
8335 8355
8336 if (map_length >= submit_len) { 8356 if (geom.len >= submit_len) {
8337 bio = orig_bio; 8357 bio = orig_bio;
8338 dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; 8358 dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
8339 goto submit; 8359 goto submit;
@@ -8346,10 +8366,10 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
8346 async_submit = 1; 8366 async_submit = 1;
8347 8367
8348 /* bio split */ 8368 /* bio split */
8349 ASSERT(map_length <= INT_MAX); 8369 ASSERT(geom.len <= INT_MAX);
8350 atomic_inc(&dip->pending_bios); 8370 atomic_inc(&dip->pending_bios);
8351 do { 8371 do {
8352 clone_len = min_t(int, submit_len, map_length); 8372 clone_len = min_t(int, submit_len, geom.len);
8353 8373
8354 /* 8374 /*
8355 * This will never fail as it's passing GPF_NOFS and 8375 * This will never fail as it's passing GPF_NOFS and
@@ -8386,9 +8406,8 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
8386 start_sector += clone_len >> 9; 8406 start_sector += clone_len >> 9;
8387 file_offset += clone_len; 8407 file_offset += clone_len;
8388 8408
8389 map_length = submit_len; 8409 ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
8390 ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), 8410 start_sector << 9, submit_len, &geom);
8391 start_sector << 9, &map_length, NULL, 0);
8392 if (ret) 8411 if (ret)
8393 goto out_err; 8412 goto out_err;
8394 } while (submit_len > 0); 8413 } while (submit_len > 0);
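Both inode.c hunks above replace btrfs_map_block() with btrfs_get_io_geometry(), which only reports the I/O geometry at a logical address instead of building a full mapping. A condensed sketch of the new query (example_stripe_room() is illustrative; only the geom.len field shown in the hunks is used):

static int example_stripe_room(struct btrfs_fs_info *fs_info, struct bio *bio,
			       u64 logical, u64 len, u64 *room)
{
	struct btrfs_io_geometry geom;
	int ret;

	ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, len, &geom);
	if (ret < 0)
		return ret;

	/* bytes that can be submitted from 'logical' without splitting */
	*room = geom.len;
	return 0;
}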
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cfeff1b8dce0..818f7ec8bb0e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -43,6 +43,8 @@
43#include "qgroup.h" 43#include "qgroup.h"
44#include "tree-log.h" 44#include "tree-log.h"
45#include "compression.h" 45#include "compression.h"
46#include "space-info.h"
47#include "delalloc-space.h"
46 48
47#ifdef CONFIG_64BIT 49#ifdef CONFIG_64BIT
48/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI 50/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -3993,6 +3995,27 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
3993 if (!same_inode) 3995 if (!same_inode)
3994 inode_dio_wait(inode_out); 3996 inode_dio_wait(inode_out);
3995 3997
3998 /*
 3999 * Workaround to make sure NOCOW buffered writes reach disk as NOCOW.
 4000 *
 4001 * Btrfs' back references do not have block-level granularity; they
 4002 * work at the whole extent level.
 4003 * A NOCOW buffered write without data space reserved may not be able
 4004 * to fall back to CoW due to lack of data space and thus could cause
 4005 * data loss.
 4006 *
 4007 * Here we take a shortcut by flushing the whole inode, so that all
 4008 * nocow writes reach disk as nocow before we increase the
 4009 * reference of the extent. We could do better by only flushing NOCOW
 4010 * data, but that needs extra accounting.
 4011 *
 4012 * Also we don't need to check ASYNC_EXTENT, as async extents will be
 4013 * CoWed anyway and do not affect the nocow part.
4014 */
4015 ret = filemap_flush(inode_in->i_mapping);
4016 if (ret < 0)
4017 return ret;
4018
3996 ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), 4019 ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
3997 wb_len); 4020 wb_len);
3998 if (ret < 0) 4021 if (ret < 0)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 2f6c3c7851ed..98fccce4208c 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -15,19 +15,19 @@
15#ifdef CONFIG_BTRFS_DEBUG 15#ifdef CONFIG_BTRFS_DEBUG
16static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) 16static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
17{ 17{
18 WARN_ON(atomic_read(&eb->spinning_writers)); 18 WARN_ON(eb->spinning_writers);
19 atomic_inc(&eb->spinning_writers); 19 eb->spinning_writers++;
20} 20}
21 21
22static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) 22static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
23{ 23{
24 WARN_ON(atomic_read(&eb->spinning_writers) != 1); 24 WARN_ON(eb->spinning_writers != 1);
25 atomic_dec(&eb->spinning_writers); 25 eb->spinning_writers--;
26} 26}
27 27
28static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) 28static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
29{ 29{
30 WARN_ON(atomic_read(&eb->spinning_writers)); 30 WARN_ON(eb->spinning_writers);
31} 31}
32 32
33static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) 33static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
@@ -58,17 +58,17 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
58 58
59static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) 59static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
60{ 60{
61 atomic_inc(&eb->write_locks); 61 eb->write_locks++;
62} 62}
63 63
64static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) 64static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
65{ 65{
66 atomic_dec(&eb->write_locks); 66 eb->write_locks--;
67} 67}
68 68
69void btrfs_assert_tree_locked(struct extent_buffer *eb) 69void btrfs_assert_tree_locked(struct extent_buffer *eb)
70{ 70{
71 BUG_ON(!atomic_read(&eb->write_locks)); 71 BUG_ON(!eb->write_locks);
72} 72}
73 73
74#else 74#else
@@ -111,10 +111,10 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
111 */ 111 */
112 if (eb->lock_nested && current->pid == eb->lock_owner) 112 if (eb->lock_nested && current->pid == eb->lock_owner)
113 return; 113 return;
114 if (atomic_read(&eb->blocking_writers) == 0) { 114 if (eb->blocking_writers == 0) {
115 btrfs_assert_spinning_writers_put(eb); 115 btrfs_assert_spinning_writers_put(eb);
116 btrfs_assert_tree_locked(eb); 116 btrfs_assert_tree_locked(eb);
117 atomic_inc(&eb->blocking_writers); 117 eb->blocking_writers++;
118 write_unlock(&eb->lock); 118 write_unlock(&eb->lock);
119 } 119 }
120} 120}
@@ -148,12 +148,11 @@ void btrfs_clear_lock_blocking_write(struct extent_buffer *eb)
148 */ 148 */
149 if (eb->lock_nested && current->pid == eb->lock_owner) 149 if (eb->lock_nested && current->pid == eb->lock_owner)
150 return; 150 return;
151 BUG_ON(atomic_read(&eb->blocking_writers) != 1);
152 write_lock(&eb->lock); 151 write_lock(&eb->lock);
152 BUG_ON(eb->blocking_writers != 1);
153 btrfs_assert_spinning_writers_get(eb); 153 btrfs_assert_spinning_writers_get(eb);
154 /* atomic_dec_and_test implies a barrier */ 154 if (--eb->blocking_writers == 0)
155 if (atomic_dec_and_test(&eb->blocking_writers)) 155 cond_wake_up(&eb->write_lock_wq);
156 cond_wake_up_nomb(&eb->write_lock_wq);
157} 156}
158 157
159/* 158/*
@@ -167,12 +166,10 @@ void btrfs_tree_read_lock(struct extent_buffer *eb)
167 if (trace_btrfs_tree_read_lock_enabled()) 166 if (trace_btrfs_tree_read_lock_enabled())
168 start_ns = ktime_get_ns(); 167 start_ns = ktime_get_ns();
169again: 168again:
170 BUG_ON(!atomic_read(&eb->blocking_writers) &&
171 current->pid == eb->lock_owner);
172
173 read_lock(&eb->lock); 169 read_lock(&eb->lock);
174 if (atomic_read(&eb->blocking_writers) && 170 BUG_ON(eb->blocking_writers == 0 &&
175 current->pid == eb->lock_owner) { 171 current->pid == eb->lock_owner);
172 if (eb->blocking_writers && current->pid == eb->lock_owner) {
176 /* 173 /*
177 * This extent is already write-locked by our thread. We allow 174 * This extent is already write-locked by our thread. We allow
178 * an additional read lock to be added because it's for the same 175 * an additional read lock to be added because it's for the same
@@ -185,10 +182,10 @@ again:
185 trace_btrfs_tree_read_lock(eb, start_ns); 182 trace_btrfs_tree_read_lock(eb, start_ns);
186 return; 183 return;
187 } 184 }
188 if (atomic_read(&eb->blocking_writers)) { 185 if (eb->blocking_writers) {
189 read_unlock(&eb->lock); 186 read_unlock(&eb->lock);
190 wait_event(eb->write_lock_wq, 187 wait_event(eb->write_lock_wq,
191 atomic_read(&eb->blocking_writers) == 0); 188 eb->blocking_writers == 0);
192 goto again; 189 goto again;
193 } 190 }
194 btrfs_assert_tree_read_locks_get(eb); 191 btrfs_assert_tree_read_locks_get(eb);
@@ -203,11 +200,11 @@ again:
203 */ 200 */
204int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) 201int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
205{ 202{
206 if (atomic_read(&eb->blocking_writers)) 203 if (eb->blocking_writers)
207 return 0; 204 return 0;
208 205
209 read_lock(&eb->lock); 206 read_lock(&eb->lock);
210 if (atomic_read(&eb->blocking_writers)) { 207 if (eb->blocking_writers) {
211 read_unlock(&eb->lock); 208 read_unlock(&eb->lock);
212 return 0; 209 return 0;
213 } 210 }
@@ -223,13 +220,13 @@ int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
223 */ 220 */
224int btrfs_try_tree_read_lock(struct extent_buffer *eb) 221int btrfs_try_tree_read_lock(struct extent_buffer *eb)
225{ 222{
226 if (atomic_read(&eb->blocking_writers)) 223 if (eb->blocking_writers)
227 return 0; 224 return 0;
228 225
229 if (!read_trylock(&eb->lock)) 226 if (!read_trylock(&eb->lock))
230 return 0; 227 return 0;
231 228
232 if (atomic_read(&eb->blocking_writers)) { 229 if (eb->blocking_writers) {
233 read_unlock(&eb->lock); 230 read_unlock(&eb->lock);
234 return 0; 231 return 0;
235 } 232 }
@@ -245,13 +242,11 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
245 */ 242 */
246int btrfs_try_tree_write_lock(struct extent_buffer *eb) 243int btrfs_try_tree_write_lock(struct extent_buffer *eb)
247{ 244{
248 if (atomic_read(&eb->blocking_writers) || 245 if (eb->blocking_writers || atomic_read(&eb->blocking_readers))
249 atomic_read(&eb->blocking_readers))
250 return 0; 246 return 0;
251 247
252 write_lock(&eb->lock); 248 write_lock(&eb->lock);
253 if (atomic_read(&eb->blocking_writers) || 249 if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) {
254 atomic_read(&eb->blocking_readers)) {
255 write_unlock(&eb->lock); 250 write_unlock(&eb->lock);
256 return 0; 251 return 0;
257 } 252 }
@@ -322,10 +317,9 @@ void btrfs_tree_lock(struct extent_buffer *eb)
322 WARN_ON(eb->lock_owner == current->pid); 317 WARN_ON(eb->lock_owner == current->pid);
323again: 318again:
324 wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); 319 wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
325 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); 320 wait_event(eb->write_lock_wq, eb->blocking_writers == 0);
326 write_lock(&eb->lock); 321 write_lock(&eb->lock);
327 if (atomic_read(&eb->blocking_readers) || 322 if (atomic_read(&eb->blocking_readers) || eb->blocking_writers) {
328 atomic_read(&eb->blocking_writers)) {
329 write_unlock(&eb->lock); 323 write_unlock(&eb->lock);
330 goto again; 324 goto again;
331 } 325 }
@@ -340,7 +334,7 @@ again:
340 */ 334 */
341void btrfs_tree_unlock(struct extent_buffer *eb) 335void btrfs_tree_unlock(struct extent_buffer *eb)
342{ 336{
343 int blockers = atomic_read(&eb->blocking_writers); 337 int blockers = eb->blocking_writers;
344 338
345 BUG_ON(blockers > 1); 339 BUG_ON(blockers > 1);
346 340
@@ -351,7 +345,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
351 345
352 if (blockers) { 346 if (blockers) {
353 btrfs_assert_no_spinning_writers(eb); 347 btrfs_assert_no_spinning_writers(eb);
354 atomic_dec(&eb->blocking_writers); 348 eb->blocking_writers--;
355 /* Use the lighter barrier after atomic */ 349 /* Use the lighter barrier after atomic */
356 smp_mb__after_atomic(); 350 smp_mb__after_atomic();
357 cond_wake_up_nomb(&eb->write_lock_wq); 351 cond_wake_up_nomb(&eb->write_lock_wq);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 52889da69113..1744ba8b2754 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -13,6 +13,7 @@
13#include "extent_io.h" 13#include "extent_io.h"
14#include "disk-io.h" 14#include "disk-io.h"
15#include "compression.h" 15#include "compression.h"
16#include "delalloc-space.h"
16 17
17static struct kmem_cache *btrfs_ordered_extent_cache; 18static struct kmem_cache *btrfs_ordered_extent_cache;
18 19
@@ -924,14 +925,16 @@ out:
924 * be reclaimed before their checksum is actually put into the btree 925 * be reclaimed before their checksum is actually put into the btree
925 */ 926 */
926int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, 927int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
927 u32 *sum, int len) 928 u8 *sum, int len)
928{ 929{
930 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
929 struct btrfs_ordered_sum *ordered_sum; 931 struct btrfs_ordered_sum *ordered_sum;
930 struct btrfs_ordered_extent *ordered; 932 struct btrfs_ordered_extent *ordered;
931 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 933 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
932 unsigned long num_sectors; 934 unsigned long num_sectors;
933 unsigned long i; 935 unsigned long i;
934 u32 sectorsize = btrfs_inode_sectorsize(inode); 936 u32 sectorsize = btrfs_inode_sectorsize(inode);
937 const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
935 int index = 0; 938 int index = 0;
936 939
937 ordered = btrfs_lookup_ordered_extent(inode, offset); 940 ordered = btrfs_lookup_ordered_extent(inode, offset);
@@ -947,10 +950,10 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
947 num_sectors = ordered_sum->len >> 950 num_sectors = ordered_sum->len >>
948 inode->i_sb->s_blocksize_bits; 951 inode->i_sb->s_blocksize_bits;
949 num_sectors = min_t(int, len - index, num_sectors - i); 952 num_sectors = min_t(int, len - index, num_sectors - i);
950 memcpy(sum + index, ordered_sum->sums + i, 953 memcpy(sum + index, ordered_sum->sums + i * csum_size,
951 num_sectors); 954 num_sectors * csum_size);
952 955
953 index += (int)num_sectors; 956 index += (int)num_sectors * csum_size;
954 if (index == len) 957 if (index == len)
955 goto out; 958 goto out;
956 disk_bytenr += num_sectors * sectorsize; 959 disk_bytenr += num_sectors * sectorsize;
@@ -962,6 +965,51 @@ out:
962 return index; 965 return index;
963} 966}
964 967
968/*
 969 * btrfs_lock_and_flush_ordered_range - Lock the passed range and ensure all pending
970 * ordered extents in it are run to completion.
971 *
972 * @tree: IO tree used for locking out other users of the range
973 * @inode: Inode whose ordered tree is to be searched
974 * @start: Beginning of range to flush
975 * @end: Last byte of range to lock
976 * @cached_state: If passed, will return the extent state responsible for the
977 * locked range. It's the caller's responsibility to free the cached state.
978 *
979 * This function always returns with the given range locked, ensuring after it's
980 * called no order extent can be pending.
981 */
982void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
983 struct btrfs_inode *inode, u64 start,
984 u64 end,
985 struct extent_state **cached_state)
986{
987 struct btrfs_ordered_extent *ordered;
988 struct extent_state *cachedp = NULL;
989
990 if (cached_state)
991 cachedp = *cached_state;
992
993 while (1) {
994 lock_extent_bits(tree, start, end, &cachedp);
995 ordered = btrfs_lookup_ordered_range(inode, start,
996 end - start + 1);
997 if (!ordered) {
998 /*
999 * If no external cached_state has been passed then
1000 * decrement the extra ref taken for cachedp since we
1001 * aren't exposing it outside of this function
1002 */
1003 if (!cached_state)
1004 refcount_dec(&cachedp->refs);
1005 break;
1006 }
1007 unlock_extent_cached(tree, start, end, &cachedp);
1008 btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
1009 btrfs_put_ordered_extent(ordered);
1010 }
1011}
1012
965int __init ordered_data_init(void) 1013int __init ordered_data_init(void)
966{ 1014{
967 btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", 1015 btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
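Several call sites in this series (extent_io.c, file.c, inode.c) now use the helper above instead of open-coding the lock/lookup/wait loop. A minimal caller sketch including the unlock side (example_locked_range_work() is illustrative only):

static void example_locked_range_work(struct btrfs_inode *inode,
				      u64 start, u64 end)
{
	struct extent_io_tree *tree = &inode->io_tree;
	struct extent_state *cached_state = NULL;

	btrfs_lock_and_flush_ordered_range(tree, inode, start, end,
					   &cached_state);

	/* ... work on [start, end] with no ordered extents pending ... */

	unlock_extent_cached(tree, start, end, &cached_state);
}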
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 4c5991c3de14..5204171ea962 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -23,7 +23,7 @@ struct btrfs_ordered_sum {
23 int len; 23 int len;
24 struct list_head list; 24 struct list_head list;
25 /* last field is a variable length array of csums */ 25 /* last field is a variable length array of csums */
26 u32 sums[]; 26 u8 sums[];
27}; 27};
28 28
29/* 29/*
@@ -183,11 +183,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
183int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 183int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
184 struct btrfs_ordered_extent *ordered); 184 struct btrfs_ordered_extent *ordered);
185int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, 185int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
186 u32 *sum, int len); 186 u8 *sum, int len);
187u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, 187u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
188 const u64 range_start, const u64 range_len); 188 const u64 range_start, const u64 range_len);
189u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, 189u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
190 const u64 range_start, const u64 range_len); 190 const u64 range_start, const u64 range_len);
191void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
192 struct btrfs_inode *inode, u64 start,
193 u64 end,
194 struct extent_state **cached_state);
191int __init ordered_data_init(void); 195int __init ordered_data_init(void);
192void __cold ordered_data_exit(void); 196void __cold ordered_data_exit(void);
193 197
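Since sums[] is now a byte array, callers of btrfs_find_ordered_sum() size and index the destination buffer in bytes, one checksum per sector, rather than in u32 slots. A stand-alone arithmetic sketch with example values:

#include <stdio.h>

int main(void)
{
	const unsigned int nblocks = 8;			/* sectors in the bio */
	const unsigned int csum_sizes[] = { 4, 32 };	/* crc32c vs a 32-byte csum */

	for (int i = 0; i < 2; i++) {
		/* destination buffer: one checksum per sector, in bytes */
		unsigned int dst_bytes = nblocks * csum_sizes[i];

		printf("%u sectors, csum_size %2u -> %3u byte csum buffer\n",
		       nblocks, csum_sizes[i], dst_bytes);
	}
	return 0;
}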
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 1141ca5fae6a..9cb50577d982 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -153,11 +153,11 @@ static void print_eb_refs_lock(struct extent_buffer *eb)
153#ifdef CONFIG_BTRFS_DEBUG 153#ifdef CONFIG_BTRFS_DEBUG
154 btrfs_info(eb->fs_info, 154 btrfs_info(eb->fs_info,
155"refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u", 155"refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u",
156 atomic_read(&eb->refs), atomic_read(&eb->write_locks), 156 atomic_read(&eb->refs), eb->write_locks,
157 atomic_read(&eb->read_locks), 157 atomic_read(&eb->read_locks),
158 atomic_read(&eb->blocking_writers), 158 eb->blocking_writers,
159 atomic_read(&eb->blocking_readers), 159 atomic_read(&eb->blocking_readers),
160 atomic_read(&eb->spinning_writers), 160 eb->spinning_writers,
161 atomic_read(&eb->spinning_readers), 161 atomic_read(&eb->spinning_readers),
162 eb->lock_owner, current->pid); 162 eb->lock_owner, current->pid);
163#endif 163#endif
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index a9e2e66152ee..e0469816c678 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -257,11 +257,7 @@ static int prop_compression_validate(const char *value, size_t len)
257 if (!value) 257 if (!value)
258 return 0; 258 return 0;
259 259
260 if (!strncmp("lzo", value, 3)) 260 if (btrfs_compress_is_valid_type(value, len))
261 return 0;
262 else if (!strncmp("zlib", value, 4))
263 return 0;
264 else if (!strncmp("zstd", value, 4))
265 return 0; 261 return 0;
266 262
267 return -EINVAL; 263 return -EINVAL;
@@ -341,7 +337,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
341 for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { 337 for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
342 const struct prop_handler *h = &prop_handlers[i]; 338 const struct prop_handler *h = &prop_handlers[i];
343 const char *value; 339 const char *value;
344 u64 num_bytes; 340 u64 num_bytes = 0;
345 341
346 if (!h->inheritable) 342 if (!h->inheritable)
347 continue; 343 continue;
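prop_compression_validate() now defers to the compression code for the list of valid names. A minimal sketch of calling the new helper directly (the bool-like return value is an assumption based on how it is used above):

static int example_validate_compression(const char *value, size_t len)
{
	/* accepts the same names the old open-coded checks did */
	if (btrfs_compress_is_valid_type(value, len))
		return 0;

	return -EINVAL;
}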
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 3e6ffbbd8b0a..f8a3c1b0a15a 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2614,6 +2614,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
2614 int ret = 0; 2614 int ret = 0;
2615 int i; 2615 int i;
2616 u64 *i_qgroups; 2616 u64 *i_qgroups;
2617 bool committing = false;
2617 struct btrfs_fs_info *fs_info = trans->fs_info; 2618 struct btrfs_fs_info *fs_info = trans->fs_info;
2618 struct btrfs_root *quota_root; 2619 struct btrfs_root *quota_root;
2619 struct btrfs_qgroup *srcgroup; 2620 struct btrfs_qgroup *srcgroup;
@@ -2621,7 +2622,25 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
2621 u32 level_size = 0; 2622 u32 level_size = 0;
2622 u64 nums; 2623 u64 nums;
2623 2624
2624 mutex_lock(&fs_info->qgroup_ioctl_lock); 2625 /*
2626 * There are only two callers of this function.
2627 *
2628 * One in create_subvol() in the ioctl context, which needs to hold
2629 * the qgroup_ioctl_lock.
2630 *
 2631 * The other one is in create_pending_snapshot(), where no other qgroup
 2632 * code can modify the fs because everyone else needs to either start a
 2633 * new transaction or hold a trans handle, thus we don't need to hold
 2634 * qgroup_ioctl_lock.
 2635 * This avoids a long and complex lock chain and keeps lockdep happy.
2636 */
2637 spin_lock(&fs_info->trans_lock);
2638 if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
2639 committing = true;
2640 spin_unlock(&fs_info->trans_lock);
2641
2642 if (!committing)
2643 mutex_lock(&fs_info->qgroup_ioctl_lock);
2625 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2644 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2626 goto out; 2645 goto out;
2627 2646
@@ -2785,7 +2804,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
2785unlock: 2804unlock:
2786 spin_unlock(&fs_info->qgroup_lock); 2805 spin_unlock(&fs_info->qgroup_lock);
2787out: 2806out:
2788 mutex_unlock(&fs_info->qgroup_ioctl_lock); 2807 if (!committing)
2808 mutex_unlock(&fs_info->qgroup_ioctl_lock);
2789 return ret; 2809 return ret;
2790} 2810}
2791 2811
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index f5d4c13a8dbc..2503485db859 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -7,7 +7,7 @@
7#ifndef BTRFS_RAID56_H 7#ifndef BTRFS_RAID56_H
8#define BTRFS_RAID56_H 8#define BTRFS_RAID56_H
9 9
10static inline int nr_parity_stripes(struct map_lookup *map) 10static inline int nr_parity_stripes(const struct map_lookup *map)
11{ 11{
12 if (map->type & BTRFS_BLOCK_GROUP_RAID5) 12 if (map->type & BTRFS_BLOCK_GROUP_RAID5)
13 return 1; 13 return 1;
@@ -17,7 +17,7 @@ static inline int nr_parity_stripes(struct map_lookup *map)
17 return 0; 17 return 0;
18} 18}
19 19
20static inline int nr_data_stripes(struct map_lookup *map) 20static inline int nr_data_stripes(const struct map_lookup *map)
21{ 21{
22 return map->num_stripes - nr_parity_stripes(map); 22 return map->num_stripes - nr_parity_stripes(map);
23} 23}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 22a3c69864fa..7f219851fa23 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -20,6 +20,7 @@
20#include "inode-map.h" 20#include "inode-map.h"
21#include "qgroup.h" 21#include "qgroup.h"
22#include "print-tree.h" 22#include "print-tree.h"
23#include "delalloc-space.h"
23 24
24/* 25/*
25 * backref_node, mapping_node and tree_block start with this 26 * backref_node, mapping_node and tree_block start with this
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 22124122728c..47733fb55df7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -9,6 +9,8 @@
9#include "transaction.h" 9#include "transaction.h"
10#include "disk-io.h" 10#include "disk-io.h"
11#include "print-tree.h" 11#include "print-tree.h"
12#include "qgroup.h"
13#include "space-info.h"
12 14
13/* 15/*
14 * Read a root item from the tree. In case we detect a root item smaller then 16 * Read a root item from the tree. In case we detect a root item smaller then
@@ -497,3 +499,57 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
497 btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec); 499 btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec);
498 spin_unlock(&root->root_item_lock); 500 spin_unlock(&root->root_item_lock);
499} 501}
502
503/*
504 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
505 * root: the root of the parent directory
506 * rsv: block reservation
 507 * items: the number of items that we need to reserve
508 * use_global_rsv: allow fallback to the global block reservation
509 *
510 * This function is used to reserve the space for snapshot/subvolume
 511 * creation and deletion. These operations differ from common
 512 * file/directory operations: they change two fs/file trees and the
 513 * root tree, and the number of items the qgroup reserves differs
 514 * from the free space reservation, so we cannot use the space
 515 * reservation mechanism in start_transaction().
516 */
517int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
518 struct btrfs_block_rsv *rsv, int items,
519 bool use_global_rsv)
520{
521 u64 qgroup_num_bytes = 0;
522 u64 num_bytes;
523 int ret;
524 struct btrfs_fs_info *fs_info = root->fs_info;
525 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
526
527 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
528 /* One for parent inode, two for dir entries */
529 qgroup_num_bytes = 3 * fs_info->nodesize;
530 ret = btrfs_qgroup_reserve_meta_prealloc(root,
531 qgroup_num_bytes, true);
532 if (ret)
533 return ret;
534 }
535
536 num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
537 rsv->space_info = btrfs_find_space_info(fs_info,
538 BTRFS_BLOCK_GROUP_METADATA);
539 ret = btrfs_block_rsv_add(root, rsv, num_bytes,
540 BTRFS_RESERVE_FLUSH_ALL);
541
542 if (ret == -ENOSPC && use_global_rsv)
543 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
544
545 if (ret && qgroup_num_bytes)
546 btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
547
548 return ret;
549}
550
551void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
552 struct btrfs_block_rsv *rsv)
553{
554 btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
555}
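The qgroup part of the reservation above is sized directly from the node size: one tree block for the parent inode plus two for the directory entries. A stand-alone arithmetic sketch with common node sizes (the metadata reservation itself comes from btrfs_calc_trans_metadata_size() and is not reproduced here):

#include <stdio.h>

int main(void)
{
	const unsigned int nodesizes[] = { 4096, 16384, 65536 };

	for (int i = 0; i < 3; i++) {
		/* one block for the parent inode, two for dir entries */
		unsigned int qgroup_bytes = 3 * nodesizes[i];

		printf("nodesize %6u -> qgroup prealloc %7u bytes\n",
		       nodesizes[i], qgroup_bytes);
	}
	return 0;
}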
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f7b29f9db5e2..0c99cf9fb595 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -6,6 +6,7 @@
6#include <linux/blkdev.h> 6#include <linux/blkdev.h>
7#include <linux/ratelimit.h> 7#include <linux/ratelimit.h>
8#include <linux/sched/mm.h> 8#include <linux/sched/mm.h>
9#include <crypto/hash.h>
9#include "ctree.h" 10#include "ctree.h"
10#include "volumes.h" 11#include "volumes.h"
11#include "disk-io.h" 12#include "disk-io.h"
@@ -1787,11 +1788,12 @@ static int scrub_checksum(struct scrub_block *sblock)
1787static int scrub_checksum_data(struct scrub_block *sblock) 1788static int scrub_checksum_data(struct scrub_block *sblock)
1788{ 1789{
1789 struct scrub_ctx *sctx = sblock->sctx; 1790 struct scrub_ctx *sctx = sblock->sctx;
1791 struct btrfs_fs_info *fs_info = sctx->fs_info;
1792 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1790 u8 csum[BTRFS_CSUM_SIZE]; 1793 u8 csum[BTRFS_CSUM_SIZE];
1791 u8 *on_disk_csum; 1794 u8 *on_disk_csum;
1792 struct page *page; 1795 struct page *page;
1793 void *buffer; 1796 void *buffer;
1794 u32 crc = ~(u32)0;
1795 u64 len; 1797 u64 len;
1796 int index; 1798 int index;
1797 1799
@@ -1799,6 +1801,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1799 if (!sblock->pagev[0]->have_csum) 1801 if (!sblock->pagev[0]->have_csum)
1800 return 0; 1802 return 0;
1801 1803
1804 shash->tfm = fs_info->csum_shash;
1805 crypto_shash_init(shash);
1806
1802 on_disk_csum = sblock->pagev[0]->csum; 1807 on_disk_csum = sblock->pagev[0]->csum;
1803 page = sblock->pagev[0]->page; 1808 page = sblock->pagev[0]->page;
1804 buffer = kmap_atomic(page); 1809 buffer = kmap_atomic(page);
@@ -1808,7 +1813,7 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1808 for (;;) { 1813 for (;;) {
1809 u64 l = min_t(u64, len, PAGE_SIZE); 1814 u64 l = min_t(u64, len, PAGE_SIZE);
1810 1815
1811 crc = btrfs_csum_data(buffer, crc, l); 1816 crypto_shash_update(shash, buffer, l);
1812 kunmap_atomic(buffer); 1817 kunmap_atomic(buffer);
1813 len -= l; 1818 len -= l;
1814 if (len == 0) 1819 if (len == 0)
@@ -1820,7 +1825,7 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1820 buffer = kmap_atomic(page); 1825 buffer = kmap_atomic(page);
1821 } 1826 }
1822 1827
1823 btrfs_csum_final(crc, csum); 1828 crypto_shash_final(shash, csum);
1824 if (memcmp(csum, on_disk_csum, sctx->csum_size)) 1829 if (memcmp(csum, on_disk_csum, sctx->csum_size))
1825 sblock->checksum_error = 1; 1830 sblock->checksum_error = 1;
1826 1831
@@ -1832,16 +1837,19 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1832 struct scrub_ctx *sctx = sblock->sctx; 1837 struct scrub_ctx *sctx = sblock->sctx;
1833 struct btrfs_header *h; 1838 struct btrfs_header *h;
1834 struct btrfs_fs_info *fs_info = sctx->fs_info; 1839 struct btrfs_fs_info *fs_info = sctx->fs_info;
1840 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1835 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1841 u8 calculated_csum[BTRFS_CSUM_SIZE];
1836 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1842 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1837 struct page *page; 1843 struct page *page;
1838 void *mapped_buffer; 1844 void *mapped_buffer;
1839 u64 mapped_size; 1845 u64 mapped_size;
1840 void *p; 1846 void *p;
1841 u32 crc = ~(u32)0;
1842 u64 len; 1847 u64 len;
1843 int index; 1848 int index;
1844 1849
1850 shash->tfm = fs_info->csum_shash;
1851 crypto_shash_init(shash);
1852
1845 BUG_ON(sblock->page_count < 1); 1853 BUG_ON(sblock->page_count < 1);
1846 page = sblock->pagev[0]->page; 1854 page = sblock->pagev[0]->page;
1847 mapped_buffer = kmap_atomic(page); 1855 mapped_buffer = kmap_atomic(page);
@@ -1875,7 +1883,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1875 for (;;) { 1883 for (;;) {
1876 u64 l = min_t(u64, len, mapped_size); 1884 u64 l = min_t(u64, len, mapped_size);
1877 1885
1878 crc = btrfs_csum_data(p, crc, l); 1886 crypto_shash_update(shash, p, l);
1879 kunmap_atomic(mapped_buffer); 1887 kunmap_atomic(mapped_buffer);
1880 len -= l; 1888 len -= l;
1881 if (len == 0) 1889 if (len == 0)
@@ -1889,7 +1897,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1889 p = mapped_buffer; 1897 p = mapped_buffer;
1890 } 1898 }
1891 1899
1892 btrfs_csum_final(crc, calculated_csum); 1900 crypto_shash_final(shash, calculated_csum);
1893 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) 1901 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1894 sblock->checksum_error = 1; 1902 sblock->checksum_error = 1;
1895 1903
@@ -1900,18 +1908,22 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1900{ 1908{
1901 struct btrfs_super_block *s; 1909 struct btrfs_super_block *s;
1902 struct scrub_ctx *sctx = sblock->sctx; 1910 struct scrub_ctx *sctx = sblock->sctx;
1911 struct btrfs_fs_info *fs_info = sctx->fs_info;
1912 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1903 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1913 u8 calculated_csum[BTRFS_CSUM_SIZE];
1904 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1914 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1905 struct page *page; 1915 struct page *page;
1906 void *mapped_buffer; 1916 void *mapped_buffer;
1907 u64 mapped_size; 1917 u64 mapped_size;
1908 void *p; 1918 void *p;
1909 u32 crc = ~(u32)0;
1910 int fail_gen = 0; 1919 int fail_gen = 0;
1911 int fail_cor = 0; 1920 int fail_cor = 0;
1912 u64 len; 1921 u64 len;
1913 int index; 1922 int index;
1914 1923
1924 shash->tfm = fs_info->csum_shash;
1925 crypto_shash_init(shash);
1926
1915 BUG_ON(sblock->page_count < 1); 1927 BUG_ON(sblock->page_count < 1);
1916 page = sblock->pagev[0]->page; 1928 page = sblock->pagev[0]->page;
1917 mapped_buffer = kmap_atomic(page); 1929 mapped_buffer = kmap_atomic(page);
@@ -1934,7 +1946,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1934 for (;;) { 1946 for (;;) {
1935 u64 l = min_t(u64, len, mapped_size); 1947 u64 l = min_t(u64, len, mapped_size);
1936 1948
1937 crc = btrfs_csum_data(p, crc, l); 1949 crypto_shash_update(shash, p, l);
1938 kunmap_atomic(mapped_buffer); 1950 kunmap_atomic(mapped_buffer);
1939 len -= l; 1951 len -= l;
1940 if (len == 0) 1952 if (len == 0)
@@ -1948,7 +1960,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1948 p = mapped_buffer; 1960 p = mapped_buffer;
1949 } 1961 }
1950 1962
1951 btrfs_csum_final(crc, calculated_csum); 1963 crypto_shash_final(shash, calculated_csum);
1952 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) 1964 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1953 ++fail_cor; 1965 ++fail_cor;
1954 1966
@@ -2448,7 +2460,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2448 ASSERT(index < UINT_MAX); 2460 ASSERT(index < UINT_MAX);
2449 2461
2450 num_sectors = sum->len / sctx->fs_info->sectorsize; 2462 num_sectors = sum->len / sctx->fs_info->sectorsize;
2451 memcpy(csum, sum->sums + index, sctx->csum_size); 2463 memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size);
2452 if (index == num_sectors - 1) { 2464 if (index == num_sectors - 1) {
2453 list_del(&sum->list); 2465 list_del(&sum->list);
2454 kfree(sum); 2466 kfree(sum);
@@ -2660,18 +2672,18 @@ static int get_raid56_logic_offset(u64 physical, int num,
2660 u64 last_offset; 2672 u64 last_offset;
2661 u32 stripe_index; 2673 u32 stripe_index;
2662 u32 rot; 2674 u32 rot;
2675 const int data_stripes = nr_data_stripes(map);
2663 2676
2664 last_offset = (physical - map->stripes[num].physical) * 2677 last_offset = (physical - map->stripes[num].physical) * data_stripes;
2665 nr_data_stripes(map);
2666 if (stripe_start) 2678 if (stripe_start)
2667 *stripe_start = last_offset; 2679 *stripe_start = last_offset;
2668 2680
2669 *offset = last_offset; 2681 *offset = last_offset;
2670 for (i = 0; i < nr_data_stripes(map); i++) { 2682 for (i = 0; i < data_stripes; i++) {
2671 *offset = last_offset + i * map->stripe_len; 2683 *offset = last_offset + i * map->stripe_len;
2672 2684
2673 stripe_nr = div64_u64(*offset, map->stripe_len); 2685 stripe_nr = div64_u64(*offset, map->stripe_len);
2674 stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); 2686 stripe_nr = div_u64(stripe_nr, data_stripes);
2675 2687
2676 /* Work out the disk rotation on this stripe-set */ 2688 /* Work out the disk rotation on this stripe-set */
2677 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); 2689 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
@@ -3079,7 +3091,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3079 offset = map->stripe_len * (num / map->sub_stripes); 3091 offset = map->stripe_len * (num / map->sub_stripes);
3080 increment = map->stripe_len * factor; 3092 increment = map->stripe_len * factor;
3081 mirror_num = num % map->sub_stripes + 1; 3093 mirror_num = num % map->sub_stripes + 1;
3082 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3094 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
3083 increment = map->stripe_len; 3095 increment = map->stripe_len;
3084 mirror_num = num % map->num_stripes + 1; 3096 mirror_num = num % map->num_stripes + 1;
3085 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3097 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
@@ -3410,15 +3422,15 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3410 struct btrfs_block_group_cache *cache) 3422 struct btrfs_block_group_cache *cache)
3411{ 3423{
3412 struct btrfs_fs_info *fs_info = sctx->fs_info; 3424 struct btrfs_fs_info *fs_info = sctx->fs_info;
3413 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 3425 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3414 struct map_lookup *map; 3426 struct map_lookup *map;
3415 struct extent_map *em; 3427 struct extent_map *em;
3416 int i; 3428 int i;
3417 int ret = 0; 3429 int ret = 0;
3418 3430
3419 read_lock(&map_tree->map_tree.lock); 3431 read_lock(&map_tree->lock);
3420 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 3432 em = lookup_extent_mapping(map_tree, chunk_offset, 1);
3421 read_unlock(&map_tree->map_tree.lock); 3433 read_unlock(&map_tree->lock);
3422 3434
3423 if (!em) { 3435 if (!em) {
3424 /* 3436 /*
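All three scrub checksum paths are converted to the same crypto_shash pattern (init, update over each mapped page, final). A minimal sketch of that pattern over a single buffer, assuming fs_info->csum_shash has already been allocated at mount time and <crypto/hash.h> is included:

	static void demo_csum_buffer(struct btrfs_fs_info *fs_info,
				     const u8 *data, unsigned int len,
				     u8 result[BTRFS_CSUM_SIZE])
	{
		SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

		shash->tfm = fs_info->csum_shash;
		crypto_shash_init(shash);
		crypto_shash_update(shash, data, len);
		crypto_shash_final(shash, result);
	}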
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f7fe4770f0e5..69b59bf75882 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -686,7 +686,7 @@ static int send_cmd(struct send_ctx *sctx)
686 hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); 686 hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
687 hdr->crc = 0; 687 hdr->crc = 0;
688 688
689 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); 689 crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
690 hdr->crc = cpu_to_le32(crc); 690 hdr->crc = cpu_to_le32(crc);
691 691
692 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, 692 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -6929,9 +6929,23 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
6929 if (ret) 6929 if (ret)
6930 goto out; 6930 goto out;
6931 6931
6932 mutex_lock(&fs_info->balance_mutex);
6933 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
6934 mutex_unlock(&fs_info->balance_mutex);
6935 btrfs_warn_rl(fs_info,
6936 "cannot run send because a balance operation is in progress");
6937 ret = -EAGAIN;
6938 goto out;
6939 }
6940 fs_info->send_in_progress++;
6941 mutex_unlock(&fs_info->balance_mutex);
6942
6932 current->journal_info = BTRFS_SEND_TRANS_STUB; 6943 current->journal_info = BTRFS_SEND_TRANS_STUB;
6933 ret = send_subvol(sctx); 6944 ret = send_subvol(sctx);
6934 current->journal_info = NULL; 6945 current->journal_info = NULL;
6946 mutex_lock(&fs_info->balance_mutex);
6947 fs_info->send_in_progress--;
6948 mutex_unlock(&fs_info->balance_mutex);
6935 if (ret < 0) 6949 if (ret < 0)
6936 goto out; 6950 goto out;
6937 6951
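Send and balance now exclude each other through balance_mutex, which protects both the BTRFS_FS_BALANCE_RUNNING bit and the send_in_progress counter. The balance side is not part of this hunk; a sketch of what the mirror-image check could look like, with a hypothetical helper name:

	static int demo_balance_excl_check(struct btrfs_fs_info *fs_info)
	{
		int ret = 0;

		mutex_lock(&fs_info->balance_mutex);
		if (fs_info->send_in_progress) {
			btrfs_warn_rl(fs_info,
				"cannot run balance while send is in progress");
			ret = -EAGAIN;
		}
		mutex_unlock(&fs_info->balance_mutex);
		return ret;
	}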
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
new file mode 100644
index 000000000000..ab7b9ec4c240
--- /dev/null
+++ b/fs/btrfs/space-info.c
@@ -0,0 +1,1094 @@
1// SPDX-License-Identifier: GPL-2.0
2
3#include "ctree.h"
4#include "space-info.h"
5#include "sysfs.h"
6#include "volumes.h"
7#include "free-space-cache.h"
8#include "ordered-data.h"
9#include "transaction.h"
10#include "math.h"
11
12u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
13 bool may_use_included)
14{
15 ASSERT(s_info);
16 return s_info->bytes_used + s_info->bytes_reserved +
17 s_info->bytes_pinned + s_info->bytes_readonly +
18 (may_use_included ? s_info->bytes_may_use : 0);
19}
20
21/*
22 * after adding space to the filesystem, we need to clear the full flags
23 * on all the space infos.
24 */
25void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
26{
27 struct list_head *head = &info->space_info;
28 struct btrfs_space_info *found;
29
30 rcu_read_lock();
31 list_for_each_entry_rcu(found, head, list)
32 found->full = 0;
33 rcu_read_unlock();
34}
35
36static const char *alloc_name(u64 flags)
37{
38 switch (flags) {
39 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
40 return "mixed";
41 case BTRFS_BLOCK_GROUP_METADATA:
42 return "metadata";
43 case BTRFS_BLOCK_GROUP_DATA:
44 return "data";
45 case BTRFS_BLOCK_GROUP_SYSTEM:
46 return "system";
47 default:
48 WARN_ON(1);
49 return "invalid-combination";
50 };
51}
52
53static int create_space_info(struct btrfs_fs_info *info, u64 flags)
54{
55
56 struct btrfs_space_info *space_info;
57 int i;
58 int ret;
59
60 space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
61 if (!space_info)
62 return -ENOMEM;
63
64 ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
65 GFP_KERNEL);
66 if (ret) {
67 kfree(space_info);
68 return ret;
69 }
70
71 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
72 INIT_LIST_HEAD(&space_info->block_groups[i]);
73 init_rwsem(&space_info->groups_sem);
74 spin_lock_init(&space_info->lock);
75 space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
76 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
77 init_waitqueue_head(&space_info->wait);
78 INIT_LIST_HEAD(&space_info->ro_bgs);
79 INIT_LIST_HEAD(&space_info->tickets);
80 INIT_LIST_HEAD(&space_info->priority_tickets);
81
82 ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
83 info->space_info_kobj, "%s",
84 alloc_name(space_info->flags));
85 if (ret) {
86 kobject_put(&space_info->kobj);
87 return ret;
88 }
89
90 list_add_rcu(&space_info->list, &info->space_info);
91 if (flags & BTRFS_BLOCK_GROUP_DATA)
92 info->data_sinfo = space_info;
93
94 return ret;
95}
96
97int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
98{
99 struct btrfs_super_block *disk_super;
100 u64 features;
101 u64 flags;
102 int mixed = 0;
103 int ret;
104
105 disk_super = fs_info->super_copy;
106 if (!btrfs_super_root(disk_super))
107 return -EINVAL;
108
109 features = btrfs_super_incompat_flags(disk_super);
110 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
111 mixed = 1;
112
113 flags = BTRFS_BLOCK_GROUP_SYSTEM;
114 ret = create_space_info(fs_info, flags);
115 if (ret)
116 goto out;
117
118 if (mixed) {
119 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
120 ret = create_space_info(fs_info, flags);
121 } else {
122 flags = BTRFS_BLOCK_GROUP_METADATA;
123 ret = create_space_info(fs_info, flags);
124 if (ret)
125 goto out;
126
127 flags = BTRFS_BLOCK_GROUP_DATA;
128 ret = create_space_info(fs_info, flags);
129 }
130out:
131 return ret;
132}
133
134void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
135 u64 total_bytes, u64 bytes_used,
136 u64 bytes_readonly,
137 struct btrfs_space_info **space_info)
138{
139 struct btrfs_space_info *found;
140 int factor;
141
142 factor = btrfs_bg_type_to_factor(flags);
143
144 found = btrfs_find_space_info(info, flags);
145 ASSERT(found);
146 spin_lock(&found->lock);
147 found->total_bytes += total_bytes;
148 found->disk_total += total_bytes * factor;
149 found->bytes_used += bytes_used;
150 found->disk_used += bytes_used * factor;
151 found->bytes_readonly += bytes_readonly;
152 if (total_bytes > 0)
153 found->full = 0;
154 btrfs_space_info_add_new_bytes(info, found,
155 total_bytes - bytes_used -
156 bytes_readonly);
157 spin_unlock(&found->lock);
158 *space_info = found;
159}
160
161struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
162 u64 flags)
163{
164 struct list_head *head = &info->space_info;
165 struct btrfs_space_info *found;
166
167 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
168
169 rcu_read_lock();
170 list_for_each_entry_rcu(found, head, list) {
171 if (found->flags & flags) {
172 rcu_read_unlock();
173 return found;
174 }
175 }
176 rcu_read_unlock();
177 return NULL;
178}
179
180static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
181{
182 return (global->size << 1);
183}
184
185static int can_overcommit(struct btrfs_fs_info *fs_info,
186 struct btrfs_space_info *space_info, u64 bytes,
187 enum btrfs_reserve_flush_enum flush,
188 bool system_chunk)
189{
190 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
191 u64 profile;
192 u64 space_size;
193 u64 avail;
194 u64 used;
195 int factor;
196
197 /* Don't overcommit when in mixed mode. */
198 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
199 return 0;
200
201 if (system_chunk)
202 profile = btrfs_system_alloc_profile(fs_info);
203 else
204 profile = btrfs_metadata_alloc_profile(fs_info);
205
206 used = btrfs_space_info_used(space_info, false);
207
208 /*
209 * We only want to allow overcommitting if we have lots of actual space
210 * free, but if we don't have enough space to handle the global reserve
211 * space then we could end up having a real enospc problem when trying
212 * to allocate a chunk or some other such important allocation.
213 */
214 spin_lock(&global_rsv->lock);
215 space_size = calc_global_rsv_need_space(global_rsv);
216 spin_unlock(&global_rsv->lock);
217 if (used + space_size >= space_info->total_bytes)
218 return 0;
219
220 used += space_info->bytes_may_use;
221
222 avail = atomic64_read(&fs_info->free_chunk_space);
223
224 /*
225 * If we have dup, raid1 or raid10 then only half of the free
226 * space is actually usable. For raid56, the space info used
227 * doesn't include the parity drive, so we don't have to
228 * change the math
229 */
230 factor = btrfs_bg_type_to_factor(profile);
231 avail = div_u64(avail, factor);
232
233 /*
234 * If we aren't flushing all things, let us overcommit up to
235 * 1/2 of the space. If we can flush, don't let us overcommit
236 * too much, let it overcommit up to 1/8 of the space.
237 */
238 if (flush == BTRFS_RESERVE_FLUSH_ALL)
239 avail >>= 3;
240 else
241 avail >>= 1;
242
243 if (used + bytes < space_info->total_bytes + avail)
244 return 1;
245 return 0;
246}
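A worked example of the overcommit math with made-up numbers: RAID1 metadata (factor 2), 10 GiB of unallocated device space, and a BTRFS_RESERVE_FLUSH_ALL reservation:

	/* Needs linux/sizes.h and linux/math64.h; numbers are illustrative. */
	static u64 demo_overcommit_allowance(void)
	{
		u64 avail = 10ULL * SZ_1G;	/* pretend free_chunk_space = 10 GiB   */

		avail = div_u64(avail, 2);	/* RAID1 factor 2 -> 5 GiB usable      */
		avail >>= 3;			/* FLUSH_ALL -> overcommit 1/8: 640 MiB */
		return avail;
	}

The reservation is then allowed as long as used + bytes stays below total_bytes plus that 640 MiB allowance.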
247
248/*
249 * This is for space we already have accounted in space_info->bytes_may_use, so
250 * basically when we're returning space from block_rsv's.
251 */
252void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
253 struct btrfs_space_info *space_info,
254 u64 num_bytes)
255{
256 struct reserve_ticket *ticket;
257 struct list_head *head;
258 u64 used;
259 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
260 bool check_overcommit = false;
261
262 spin_lock(&space_info->lock);
263 head = &space_info->priority_tickets;
264
265 /*
266 * If we are over our limit then we need to check and see if we can
267 * overcommit, and if we can't then we just need to free up our space
268 * and not satisfy any requests.
269 */
270 used = btrfs_space_info_used(space_info, true);
271 if (used - num_bytes >= space_info->total_bytes)
272 check_overcommit = true;
273again:
274 while (!list_empty(head) && num_bytes) {
275 ticket = list_first_entry(head, struct reserve_ticket,
276 list);
277 /*
278 * We use 0 bytes because this space is already reserved, so
279 * adding the ticket space would be a double count.
280 */
281 if (check_overcommit &&
282 !can_overcommit(fs_info, space_info, 0, flush, false))
283 break;
284 if (num_bytes >= ticket->bytes) {
285 list_del_init(&ticket->list);
286 num_bytes -= ticket->bytes;
287 ticket->bytes = 0;
288 space_info->tickets_id++;
289 wake_up(&ticket->wait);
290 } else {
291 ticket->bytes -= num_bytes;
292 num_bytes = 0;
293 }
294 }
295
296 if (num_bytes && head == &space_info->priority_tickets) {
297 head = &space_info->tickets;
298 flush = BTRFS_RESERVE_FLUSH_ALL;
299 goto again;
300 }
301 btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
302 trace_btrfs_space_reservation(fs_info, "space_info",
303 space_info->flags, num_bytes, 0);
304 spin_unlock(&space_info->lock);
305}
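A worked example of the loop above, with made-up numbers: 1 MiB comes back while two priority tickets wait for 768 KiB and 512 KiB. The first ticket is fully satisfied and woken (num_bytes drops to 256 KiB), the second is trimmed to 256 KiB outstanding (num_bytes drops to 0), and bytes_may_use is then decreased by the remaining 0 bytes.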
306
307/*
308 * This is for newly allocated space that isn't accounted in
309 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
310 * we use this helper.
311 */
312void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
313 struct btrfs_space_info *space_info,
314 u64 num_bytes)
315{
316 struct reserve_ticket *ticket;
317 struct list_head *head = &space_info->priority_tickets;
318
319again:
320 while (!list_empty(head) && num_bytes) {
321 ticket = list_first_entry(head, struct reserve_ticket,
322 list);
323 if (num_bytes >= ticket->bytes) {
324 trace_btrfs_space_reservation(fs_info, "space_info",
325 space_info->flags,
326 ticket->bytes, 1);
327 list_del_init(&ticket->list);
328 num_bytes -= ticket->bytes;
329 btrfs_space_info_update_bytes_may_use(fs_info,
330 space_info,
331 ticket->bytes);
332 ticket->bytes = 0;
333 space_info->tickets_id++;
334 wake_up(&ticket->wait);
335 } else {
336 trace_btrfs_space_reservation(fs_info, "space_info",
337 space_info->flags,
338 num_bytes, 1);
339 btrfs_space_info_update_bytes_may_use(fs_info,
340 space_info,
341 num_bytes);
342 ticket->bytes -= num_bytes;
343 num_bytes = 0;
344 }
345 }
346
347 if (num_bytes && head == &space_info->priority_tickets) {
348 head = &space_info->tickets;
349 goto again;
350 }
351}
352
353#define DUMP_BLOCK_RSV(fs_info, rsv_name) \
354do { \
355 struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
356 spin_lock(&__rsv->lock); \
357 btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
358 __rsv->size, __rsv->reserved); \
359 spin_unlock(&__rsv->lock); \
360} while (0)
361
362void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
363 struct btrfs_space_info *info, u64 bytes,
364 int dump_block_groups)
365{
366 struct btrfs_block_group_cache *cache;
367 int index = 0;
368
369 spin_lock(&info->lock);
370 btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
371 info->flags,
372 info->total_bytes - btrfs_space_info_used(info, true),
373 info->full ? "" : "not ");
374 btrfs_info(fs_info,
375 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
376 info->total_bytes, info->bytes_used, info->bytes_pinned,
377 info->bytes_reserved, info->bytes_may_use,
378 info->bytes_readonly);
379 spin_unlock(&info->lock);
380
381 DUMP_BLOCK_RSV(fs_info, global_block_rsv);
382 DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
383 DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
384 DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
385 DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
386
387 if (!dump_block_groups)
388 return;
389
390 down_read(&info->groups_sem);
391again:
392 list_for_each_entry(cache, &info->block_groups[index], list) {
393 spin_lock(&cache->lock);
394 btrfs_info(fs_info,
395 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
396 cache->key.objectid, cache->key.offset,
397 btrfs_block_group_used(&cache->item), cache->pinned,
398 cache->reserved, cache->ro ? "[readonly]" : "");
399 btrfs_dump_free_space(cache, bytes);
400 spin_unlock(&cache->lock);
401 }
402 if (++index < BTRFS_NR_RAID_TYPES)
403 goto again;
404 up_read(&info->groups_sem);
405}
406
407static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
408 unsigned long nr_pages, int nr_items)
409{
410 struct super_block *sb = fs_info->sb;
411
412 if (down_read_trylock(&sb->s_umount)) {
413 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
414 up_read(&sb->s_umount);
415 } else {
416 /*
417 * We needn't worry about the filesystem going from r/w to r/o even
418 * though we don't acquire the ->s_umount mutex, because the filesystem
419 * should guarantee that the delalloc inode list is empty after the
420 * filesystem becomes read-only (all dirty pages have been written to
421 * disk).
422 */
423 btrfs_start_delalloc_roots(fs_info, nr_items);
424 if (!current->journal_info)
425 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
426 }
427}
428
429static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
430 u64 to_reclaim)
431{
432 u64 bytes;
433 u64 nr;
434
435 bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
436 nr = div64_u64(to_reclaim, bytes);
437 if (!nr)
438 nr = 1;
439 return nr;
440}
441
442#define EXTENT_SIZE_PER_ITEM SZ_256K
443
444/*
445 * shrink metadata reservation for delalloc
446 */
447static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
448 u64 orig, bool wait_ordered)
449{
450 struct btrfs_space_info *space_info;
451 struct btrfs_trans_handle *trans;
452 u64 delalloc_bytes;
453 u64 dio_bytes;
454 u64 async_pages;
455 u64 items;
456 long time_left;
457 unsigned long nr_pages;
458 int loops;
459
460 /* Calc the number of the pages we need flush for space reservation */
461 items = calc_reclaim_items_nr(fs_info, to_reclaim);
462 to_reclaim = items * EXTENT_SIZE_PER_ITEM;
463
464 trans = (struct btrfs_trans_handle *)current->journal_info;
465 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
466
467 delalloc_bytes = percpu_counter_sum_positive(
468 &fs_info->delalloc_bytes);
469 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
470 if (delalloc_bytes == 0 && dio_bytes == 0) {
471 if (trans)
472 return;
473 if (wait_ordered)
474 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
475 return;
476 }
477
478 /*
479 * If we are doing more ordered than delalloc we need to just wait on
480 * ordered extents, otherwise we'll waste time trying to flush delalloc
481 * that likely won't give us the space back we need.
482 */
483 if (dio_bytes > delalloc_bytes)
484 wait_ordered = true;
485
486 loops = 0;
487 while ((delalloc_bytes || dio_bytes) && loops < 3) {
488 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
489
490 /*
491 * Triggers inode writeback for up to nr_pages. This will invoke
492 * ->writepages callback and trigger delalloc filling
493 * (btrfs_run_delalloc_range()).
494 */
495 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
496
497 /*
498 * We need to wait for the compressed pages to start before
499 * we continue.
500 */
501 async_pages = atomic_read(&fs_info->async_delalloc_pages);
502 if (!async_pages)
503 goto skip_async;
504
505 /*
506 * Calculate how many compressed pages we want to be written
507 * before we continue. I.e if there are more async pages than we
508 * require wait_event will wait until nr_pages are written.
509 */
510 if (async_pages <= nr_pages)
511 async_pages = 0;
512 else
513 async_pages -= nr_pages;
514
515 wait_event(fs_info->async_submit_wait,
516 atomic_read(&fs_info->async_delalloc_pages) <=
517 (int)async_pages);
518skip_async:
519 spin_lock(&space_info->lock);
520 if (list_empty(&space_info->tickets) &&
521 list_empty(&space_info->priority_tickets)) {
522 spin_unlock(&space_info->lock);
523 break;
524 }
525 spin_unlock(&space_info->lock);
526
527 loops++;
528 if (wait_ordered && !trans) {
529 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
530 } else {
531 time_left = schedule_timeout_killable(1);
532 if (time_left)
533 break;
534 }
535 delalloc_bytes = percpu_counter_sum_positive(
536 &fs_info->delalloc_bytes);
537 dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
538 }
539}
540
541/**
543 * may_commit_transaction - possibly commit the transaction if it's worth it
544 * @fs_info - the filesystem
545 * @space_info - the space_info we are trying to satisfy a reservation
546 *               for
546 *
547 * This will check to make sure that committing the transaction will actually
548 * get us somewhere and then commit the transaction if it does. Otherwise it
549 * will return -ENOSPC.
550 */
551static int may_commit_transaction(struct btrfs_fs_info *fs_info,
552 struct btrfs_space_info *space_info)
553{
554 struct reserve_ticket *ticket = NULL;
555 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
556 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
557 struct btrfs_trans_handle *trans;
558 u64 bytes_needed;
559 u64 reclaim_bytes = 0;
560
561 trans = (struct btrfs_trans_handle *)current->journal_info;
562 if (trans)
563 return -EAGAIN;
564
565 spin_lock(&space_info->lock);
566 if (!list_empty(&space_info->priority_tickets))
567 ticket = list_first_entry(&space_info->priority_tickets,
568 struct reserve_ticket, list);
569 else if (!list_empty(&space_info->tickets))
570 ticket = list_first_entry(&space_info->tickets,
571 struct reserve_ticket, list);
572 bytes_needed = (ticket) ? ticket->bytes : 0;
573 spin_unlock(&space_info->lock);
574
575 if (!bytes_needed)
576 return 0;
577
578 trans = btrfs_join_transaction(fs_info->extent_root);
579 if (IS_ERR(trans))
580 return PTR_ERR(trans);
581
582 /*
583 * See if there is enough pinned space to make this reservation, or if
584 * we have block groups that are going to be freed, allowing us to
585 * possibly do a chunk allocation the next loop through.
586 */
587 if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
588 __percpu_counter_compare(&space_info->total_bytes_pinned,
589 bytes_needed,
590 BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
591 goto commit;
592
593 /*
594 * See if there is some space in the delayed insertion reservation for
595 * this reservation.
596 */
597 if (space_info != delayed_rsv->space_info)
598 goto enospc;
599
600 spin_lock(&delayed_rsv->lock);
601 reclaim_bytes += delayed_rsv->reserved;
602 spin_unlock(&delayed_rsv->lock);
603
604 spin_lock(&delayed_refs_rsv->lock);
605 reclaim_bytes += delayed_refs_rsv->reserved;
606 spin_unlock(&delayed_refs_rsv->lock);
607 if (reclaim_bytes >= bytes_needed)
608 goto commit;
609 bytes_needed -= reclaim_bytes;
610
611 if (__percpu_counter_compare(&space_info->total_bytes_pinned,
612 bytes_needed,
613 BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
614 goto enospc;
615
616commit:
617 return btrfs_commit_transaction(trans);
618enospc:
619 btrfs_end_transaction(trans);
620 return -ENOSPC;
621}
622
623/*
624 * Try to flush some data based on policy set by @state. This is only advisory
625 * and may fail for various reasons. The caller is supposed to examine the
626 * state of @space_info to detect the outcome.
627 */
628static void flush_space(struct btrfs_fs_info *fs_info,
629 struct btrfs_space_info *space_info, u64 num_bytes,
630 int state)
631{
632 struct btrfs_root *root = fs_info->extent_root;
633 struct btrfs_trans_handle *trans;
634 int nr;
635 int ret = 0;
636
637 switch (state) {
638 case FLUSH_DELAYED_ITEMS_NR:
639 case FLUSH_DELAYED_ITEMS:
640 if (state == FLUSH_DELAYED_ITEMS_NR)
641 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
642 else
643 nr = -1;
644
645 trans = btrfs_join_transaction(root);
646 if (IS_ERR(trans)) {
647 ret = PTR_ERR(trans);
648 break;
649 }
650 ret = btrfs_run_delayed_items_nr(trans, nr);
651 btrfs_end_transaction(trans);
652 break;
653 case FLUSH_DELALLOC:
654 case FLUSH_DELALLOC_WAIT:
655 shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
656 state == FLUSH_DELALLOC_WAIT);
657 break;
658 case FLUSH_DELAYED_REFS_NR:
659 case FLUSH_DELAYED_REFS:
660 trans = btrfs_join_transaction(root);
661 if (IS_ERR(trans)) {
662 ret = PTR_ERR(trans);
663 break;
664 }
665 if (state == FLUSH_DELAYED_REFS_NR)
666 nr = calc_reclaim_items_nr(fs_info, num_bytes);
667 else
668 nr = 0;
669 btrfs_run_delayed_refs(trans, nr);
670 btrfs_end_transaction(trans);
671 break;
672 case ALLOC_CHUNK:
673 case ALLOC_CHUNK_FORCE:
674 trans = btrfs_join_transaction(root);
675 if (IS_ERR(trans)) {
676 ret = PTR_ERR(trans);
677 break;
678 }
679 ret = btrfs_chunk_alloc(trans,
680 btrfs_metadata_alloc_profile(fs_info),
681 (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
682 CHUNK_ALLOC_FORCE);
683 btrfs_end_transaction(trans);
684 if (ret > 0 || ret == -ENOSPC)
685 ret = 0;
686 break;
687 case COMMIT_TRANS:
688 /*
689 * If we have pending delayed iputs then we could free up a
690 * bunch of pinned space, so make sure we run the iputs before
691 * we do our pinned bytes check below.
692 */
693 btrfs_run_delayed_iputs(fs_info);
694 btrfs_wait_on_delayed_iputs(fs_info);
695
696 ret = may_commit_transaction(fs_info, space_info);
697 break;
698 default:
699 ret = -ENOSPC;
700 break;
701 }
702
703 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
704 ret);
705 return;
706}
707
708static inline u64
709btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
710 struct btrfs_space_info *space_info,
711 bool system_chunk)
712{
713 struct reserve_ticket *ticket;
714 u64 used;
715 u64 expected;
716 u64 to_reclaim = 0;
717
718 list_for_each_entry(ticket, &space_info->tickets, list)
719 to_reclaim += ticket->bytes;
720 list_for_each_entry(ticket, &space_info->priority_tickets, list)
721 to_reclaim += ticket->bytes;
722 if (to_reclaim)
723 return to_reclaim;
724
725 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
726 if (can_overcommit(fs_info, space_info, to_reclaim,
727 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
728 return 0;
729
730 used = btrfs_space_info_used(space_info, true);
731
732 if (can_overcommit(fs_info, space_info, SZ_1M,
733 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
734 expected = div_factor_fine(space_info->total_bytes, 95);
735 else
736 expected = div_factor_fine(space_info->total_bytes, 90);
737
738 if (used > expected)
739 to_reclaim = used - expected;
740 else
741 to_reclaim = 0;
742 to_reclaim = min(to_reclaim, space_info->bytes_may_use +
743 space_info->bytes_reserved);
744 return to_reclaim;
745}
746
747static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
748 struct btrfs_space_info *space_info,
749 u64 used, bool system_chunk)
750{
751 u64 thresh = div_factor_fine(space_info->total_bytes, 98);
752
753 /* If we're just plain full then async reclaim just slows us down. */
754 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
755 return 0;
756
757 if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
758 system_chunk))
759 return 0;
760
761 return (used >= thresh && !btrfs_fs_closing(fs_info) &&
762 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
763}
764
765static bool wake_all_tickets(struct list_head *head)
766{
767 struct reserve_ticket *ticket;
768
769 while (!list_empty(head)) {
770 ticket = list_first_entry(head, struct reserve_ticket, list);
771 list_del_init(&ticket->list);
772 ticket->error = -ENOSPC;
773 wake_up(&ticket->wait);
774 if (ticket->bytes != ticket->orig_bytes)
775 return true;
776 }
777 return false;
778}
779
780/*
781 * This is for normal flushers, we can wait all goddamned day if we want to. We
782 * will loop and continuously try to flush as long as we are making progress.
783 * We count progress as clearing off tickets each time we have to loop.
784 */
785static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
786{
787 struct btrfs_fs_info *fs_info;
788 struct btrfs_space_info *space_info;
789 u64 to_reclaim;
790 int flush_state;
791 int commit_cycles = 0;
792 u64 last_tickets_id;
793
794 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
795 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
796
797 spin_lock(&space_info->lock);
798 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
799 false);
800 if (!to_reclaim) {
801 space_info->flush = 0;
802 spin_unlock(&space_info->lock);
803 return;
804 }
805 last_tickets_id = space_info->tickets_id;
806 spin_unlock(&space_info->lock);
807
808 flush_state = FLUSH_DELAYED_ITEMS_NR;
809 do {
810 flush_space(fs_info, space_info, to_reclaim, flush_state);
811 spin_lock(&space_info->lock);
812 if (list_empty(&space_info->tickets)) {
813 space_info->flush = 0;
814 spin_unlock(&space_info->lock);
815 return;
816 }
817 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
818 space_info,
819 false);
820 if (last_tickets_id == space_info->tickets_id) {
821 flush_state++;
822 } else {
823 last_tickets_id = space_info->tickets_id;
824 flush_state = FLUSH_DELAYED_ITEMS_NR;
825 if (commit_cycles)
826 commit_cycles--;
827 }
828
829 /*
830 * We don't want to force a chunk allocation until we've tried
831 * pretty hard to reclaim space. Think of the case where we
832 * freed up a bunch of space and so have a lot of pinned space
833 * to reclaim. We would rather use that than possibly create an
834 * underutilized metadata chunk. So if this is our first run
835 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
836 * commit the transaction. If nothing has changed the next go
837 * around then we can force a chunk allocation.
838 */
839 if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
840 flush_state++;
841
842 if (flush_state > COMMIT_TRANS) {
843 commit_cycles++;
844 if (commit_cycles > 2) {
845 if (wake_all_tickets(&space_info->tickets)) {
846 flush_state = FLUSH_DELAYED_ITEMS_NR;
847 commit_cycles--;
848 } else {
849 space_info->flush = 0;
850 }
851 } else {
852 flush_state = FLUSH_DELAYED_ITEMS_NR;
853 }
854 }
855 spin_unlock(&space_info->lock);
856 } while (flush_state <= COMMIT_TRANS);
857}
858
859void btrfs_init_async_reclaim_work(struct work_struct *work)
860{
861 INIT_WORK(work, btrfs_async_reclaim_metadata_space);
862}
863
864static const enum btrfs_flush_state priority_flush_states[] = {
865 FLUSH_DELAYED_ITEMS_NR,
866 FLUSH_DELAYED_ITEMS,
867 ALLOC_CHUNK,
868};
869
870static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
871 struct btrfs_space_info *space_info,
872 struct reserve_ticket *ticket)
873{
874 u64 to_reclaim;
875 int flush_state;
876
877 spin_lock(&space_info->lock);
878 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
879 false);
880 if (!to_reclaim) {
881 spin_unlock(&space_info->lock);
882 return;
883 }
884 spin_unlock(&space_info->lock);
885
886 flush_state = 0;
887 do {
888 flush_space(fs_info, space_info, to_reclaim,
889 priority_flush_states[flush_state]);
890 flush_state++;
891 spin_lock(&space_info->lock);
892 if (ticket->bytes == 0) {
893 spin_unlock(&space_info->lock);
894 return;
895 }
896 spin_unlock(&space_info->lock);
897 } while (flush_state < ARRAY_SIZE(priority_flush_states));
898}
899
900static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
901 struct btrfs_space_info *space_info,
902 struct reserve_ticket *ticket)
903
904{
905 DEFINE_WAIT(wait);
906 u64 reclaim_bytes = 0;
907 int ret = 0;
908
909 spin_lock(&space_info->lock);
910 while (ticket->bytes > 0 && ticket->error == 0) {
911 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
912 if (ret) {
913 ret = -EINTR;
914 break;
915 }
916 spin_unlock(&space_info->lock);
917
918 schedule();
919
920 finish_wait(&ticket->wait, &wait);
921 spin_lock(&space_info->lock);
922 }
923 if (!ret)
924 ret = ticket->error;
925 if (!list_empty(&ticket->list))
926 list_del_init(&ticket->list);
927 if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
928 reclaim_bytes = ticket->orig_bytes - ticket->bytes;
929 spin_unlock(&space_info->lock);
930
931 if (reclaim_bytes)
932 btrfs_space_info_add_old_bytes(fs_info, space_info,
933 reclaim_bytes);
934 return ret;
935}
936
937/**
938 * __reserve_metadata_bytes - try to reserve bytes from the space_info
939 * @fs_info - the filesystem
940 * @space_info - the space info we want to allocate from
941 * @orig_bytes - the number of bytes we want
942 * @flush - whether or not we can flush to make our reservation
943 *
944 * This will reserve orig_bytes number of bytes from the space info associated
945 * with the block_rsv. If there is not enough space it will make an attempt to
946 * flush out space to make room. It will do this by flushing delalloc if
947 * possible or committing the transaction. If flush is 0 then no attempts to
948 * regain reservations will be made and this will fail if there is not enough
949 * space already.
950 */
951static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
952 struct btrfs_space_info *space_info,
953 u64 orig_bytes,
954 enum btrfs_reserve_flush_enum flush,
955 bool system_chunk)
956{
957 struct reserve_ticket ticket;
958 u64 used;
959 u64 reclaim_bytes = 0;
960 int ret = 0;
961
962 ASSERT(orig_bytes);
963 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
964
965 spin_lock(&space_info->lock);
966 ret = -ENOSPC;
967 used = btrfs_space_info_used(space_info, true);
968
969 /*
970 * Carry on if we have enough space (short-circuit) OR call
971 * can_overcommit() to ensure we can overcommit to continue.
972 */
973 if ((used + orig_bytes <= space_info->total_bytes) ||
974 can_overcommit(fs_info, space_info, orig_bytes, flush,
975 system_chunk)) {
976 btrfs_space_info_update_bytes_may_use(fs_info, space_info,
977 orig_bytes);
978 trace_btrfs_space_reservation(fs_info, "space_info",
979 space_info->flags, orig_bytes, 1);
980 ret = 0;
981 }
982
983 /*
984 * If we couldn't make a reservation then setup our reservation ticket
985 * and kick the async worker if it's not already running.
986 *
987 * If we are a priority flusher then we just need to add our ticket to
988 * the list and we will do our own flushing further down.
989 */
990 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
991 ticket.orig_bytes = orig_bytes;
992 ticket.bytes = orig_bytes;
993 ticket.error = 0;
994 init_waitqueue_head(&ticket.wait);
995 if (flush == BTRFS_RESERVE_FLUSH_ALL) {
996 list_add_tail(&ticket.list, &space_info->tickets);
997 if (!space_info->flush) {
998 space_info->flush = 1;
999 trace_btrfs_trigger_flush(fs_info,
1000 space_info->flags,
1001 orig_bytes, flush,
1002 "enospc");
1003 queue_work(system_unbound_wq,
1004 &fs_info->async_reclaim_work);
1005 }
1006 } else {
1007 list_add_tail(&ticket.list,
1008 &space_info->priority_tickets);
1009 }
1010 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
1011 used += orig_bytes;
1012 /*
1013 * We will do the space reservation dance during log replay,
1014 * which means we won't have fs_info->fs_root set, so don't do
1015 * the async reclaim as we will panic.
1016 */
1017 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
1018 need_do_async_reclaim(fs_info, space_info,
1019 used, system_chunk) &&
1020 !work_busy(&fs_info->async_reclaim_work)) {
1021 trace_btrfs_trigger_flush(fs_info, space_info->flags,
1022 orig_bytes, flush, "preempt");
1023 queue_work(system_unbound_wq,
1024 &fs_info->async_reclaim_work);
1025 }
1026 }
1027 spin_unlock(&space_info->lock);
1028 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
1029 return ret;
1030
1031 if (flush == BTRFS_RESERVE_FLUSH_ALL)
1032 return wait_reserve_ticket(fs_info, space_info, &ticket);
1033
1034 ret = 0;
1035 priority_reclaim_metadata_space(fs_info, space_info, &ticket);
1036 spin_lock(&space_info->lock);
1037 if (ticket.bytes) {
1038 if (ticket.bytes < orig_bytes)
1039 reclaim_bytes = orig_bytes - ticket.bytes;
1040 list_del_init(&ticket.list);
1041 ret = -ENOSPC;
1042 }
1043 spin_unlock(&space_info->lock);
1044
1045 if (reclaim_bytes)
1046 btrfs_space_info_add_old_bytes(fs_info, space_info,
1047 reclaim_bytes);
1048 ASSERT(list_empty(&ticket.list));
1049 return ret;
1050}
1051
1052/**
1053 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
1054 * @root - the root we're allocating for
1055 * @block_rsv - the block_rsv we're allocating for
1056 * @orig_bytes - the number of bytes we want
1057 * @flush - whether or not we can flush to make our reservation
1058 *
1059 * This will reserve orig_bytes number of bytes from the space info associated
1060 * with the block_rsv. If there is not enough space it will make an attempt to
1061 * flush out space to make room. It will do this by flushing delalloc if
1062 * possible or committing the transaction. If flush is 0 then no attempts to
1063 * regain reservations will be made and this will fail if there is not enough
1064 * space already.
1065 */
1066int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
1067 struct btrfs_block_rsv *block_rsv,
1068 u64 orig_bytes,
1069 enum btrfs_reserve_flush_enum flush)
1070{
1071 struct btrfs_fs_info *fs_info = root->fs_info;
1072 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
1073 int ret;
1074 bool system_chunk = (root == fs_info->chunk_root);
1075
1076 ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
1077 orig_bytes, flush, system_chunk);
1078 if (ret == -ENOSPC &&
1079 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
1080 if (block_rsv != global_rsv &&
1081 !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
1082 ret = 0;
1083 }
1084 if (ret == -ENOSPC) {
1085 trace_btrfs_space_reservation(fs_info, "space_info:enospc",
1086 block_rsv->space_info->flags,
1087 orig_bytes, 1);
1088
1089 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
1090 btrfs_dump_space_info(fs_info, block_rsv->space_info,
1091 orig_bytes, 0);
1092 }
1093 return ret;
1094}
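A hedged sketch of a caller of btrfs_reserve_metadata_bytes(); the helper name here is hypothetical, and the real callers live in block-rsv.c:

	static int demo_rsv_fill(struct btrfs_root *root,
				 struct btrfs_block_rsv *rsv, u64 num_bytes)
	{
		int ret;

		ret = btrfs_reserve_metadata_bytes(root, rsv, num_bytes,
						   BTRFS_RESERVE_FLUSH_ALL);
		if (ret)
			return ret;

		/* ... credit num_bytes to rsv->reserved under rsv->lock ... */
		return 0;
	}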
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
new file mode 100644
index 000000000000..c2b54b8e1a14
--- /dev/null
+++ b/fs/btrfs/space-info.h
@@ -0,0 +1,133 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2
3#ifndef BTRFS_SPACE_INFO_H
4#define BTRFS_SPACE_INFO_H
5
6struct btrfs_space_info {
7 spinlock_t lock;
8
9 u64 total_bytes; /* total bytes in the space,
10 this doesn't take mirrors into account */
11 u64 bytes_used; /* total bytes used,
12 this doesn't take mirrors into account */
13 u64 bytes_pinned; /* total bytes pinned, will be freed when the
14 transaction finishes */
15 u64 bytes_reserved; /* total bytes the allocator has reserved for
16 current allocations */
17 u64 bytes_may_use; /* number of bytes that may be used for
18 delalloc/allocations */
19 u64 bytes_readonly; /* total bytes that are read only */
20
21 u64 max_extent_size; /* This will hold the maximum extent size of
22 the space info if we had an ENOSPC in the
23 allocator. */
24
25 unsigned int full:1; /* indicates that we cannot allocate any more
26 chunks for this space */
27 unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
28
29 unsigned int flush:1; /* set if we are trying to make space */
30
31 unsigned int force_alloc; /* set if we need to force a chunk
32 alloc for this space */
33
34 u64 disk_used; /* total bytes used on disk */
35 u64 disk_total; /* total bytes on disk, takes mirrors into
36 account */
37
38 u64 flags;
39
40 /*
41 * bytes_pinned is kept in line with what is actually pinned, as in
42 * we've called update_block_group and dropped the bytes_used counter
43 * and increased the bytes_pinned counter. However this means that
44 * bytes_pinned does not reflect the bytes that will be pinned once the
45 * delayed refs are flushed, so this counter is inc'ed every time we
46 * call btrfs_free_extent so it is a realtime count of what will be
47 * freed once the transaction is committed. It will be zeroed every
48 * time the transaction commits.
49 */
50 struct percpu_counter total_bytes_pinned;
51
52 struct list_head list;
53 /* Protected by the spinlock 'lock'. */
54 struct list_head ro_bgs;
55 struct list_head priority_tickets;
56 struct list_head tickets;
57 /*
58 * tickets_id just indicates the next ticket to be handled, so note
59 * that it's not stored per ticket.
60 */
61 u64 tickets_id;
62
63 struct rw_semaphore groups_sem;
64 /* for block groups in our same type */
65 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
66 wait_queue_head_t wait;
67
68 struct kobject kobj;
69 struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
70};
71
72struct reserve_ticket {
73 u64 orig_bytes;
74 u64 bytes;
75 int error;
76 struct list_head list;
77 wait_queue_head_t wait;
78};
79
80static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
81{
82 return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
83 (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
84}
85
86/*
87 *
88 * Declare a helper function to detect underflow of various space info members
89 */
90#define DECLARE_SPACE_INFO_UPDATE(name) \
91static inline void \
92btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \
93 struct btrfs_space_info *sinfo, \
94 s64 bytes) \
95{ \
96 lockdep_assert_held(&sinfo->lock); \
97 trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \
98 if (bytes < 0 && sinfo->name < -bytes) { \
99 WARN_ON(1); \
100 sinfo->name = 0; \
101 return; \
102 } \
103 sinfo->name += bytes; \
104}
105
106DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
107DECLARE_SPACE_INFO_UPDATE(bytes_pinned);
108
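The macro above generates btrfs_space_info_update_bytes_may_use() and btrfs_space_info_update_bytes_pinned(), which must be called with sinfo->lock held and which warn and clamp to zero on underflow. A minimal sketch of a caller releasing a reservation; the helper name is hypothetical:

	static void demo_release_bytes(struct btrfs_fs_info *fs_info,
				       struct btrfs_space_info *space_info,
				       u64 num_bytes)
	{
		spin_lock(&space_info->lock);
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      -(s64)num_bytes);
		spin_unlock(&space_info->lock);
	}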
109void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
110 struct btrfs_space_info *space_info,
111 u64 num_bytes);
112void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
113 struct btrfs_space_info *space_info,
114 u64 num_bytes);
115int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
116void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
117 u64 total_bytes, u64 bytes_used,
118 u64 bytes_readonly,
119 struct btrfs_space_info **space_info);
120struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
121 u64 flags);
122u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
123 bool may_use_included);
124void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
125void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
126 struct btrfs_space_info *info, u64 bytes,
127 int dump_block_groups);
128int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
129 struct btrfs_block_rsv *block_rsv,
130 u64 orig_bytes,
131 enum btrfs_reserve_flush_enum flush);
132
133#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0645ec428b4f..78de9d5d80c6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -42,6 +42,7 @@
42#include "dev-replace.h" 42#include "dev-replace.h"
43#include "free-space-cache.h" 43#include "free-space-cache.h"
44#include "backref.h" 44#include "backref.h"
45#include "space-info.h"
45#include "tests/btrfs-tests.h" 46#include "tests/btrfs-tests.h"
46 47
47#include "qgroup.h" 48#include "qgroup.h"
@@ -1553,6 +1554,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
1553 } else { 1554 } else {
1554 snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); 1555 snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
1555 btrfs_sb(s)->bdev_holder = fs_type; 1556 btrfs_sb(s)->bdev_holder = fs_type;
1557 if (!strstr(crc32c_impl(), "generic"))
1558 set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
1556 error = btrfs_fill_super(s, fs_devices, data); 1559 error = btrfs_fill_super(s, fs_devices, data);
1557 } 1560 }
1558 if (!error) 1561 if (!error)
@@ -1601,14 +1604,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1601{ 1604{
1602 struct vfsmount *mnt_root; 1605 struct vfsmount *mnt_root;
1603 struct dentry *root; 1606 struct dentry *root;
1604 fmode_t mode = FMODE_READ;
1605 char *subvol_name = NULL; 1607 char *subvol_name = NULL;
1606 u64 subvol_objectid = 0; 1608 u64 subvol_objectid = 0;
1607 int error = 0; 1609 int error = 0;
1608 1610
1609 if (!(flags & SB_RDONLY))
1610 mode |= FMODE_WRITE;
1611
1612 error = btrfs_parse_subvol_options(data, &subvol_name, 1611 error = btrfs_parse_subvol_options(data, &subvol_name,
1613 &subvol_objectid); 1612 &subvol_objectid);
1614 if (error) { 1613 if (error) {
@@ -1904,8 +1903,9 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
1904 u64 type; 1903 u64 type;
1905 u64 avail_space; 1904 u64 avail_space;
1906 u64 min_stripe_size; 1905 u64 min_stripe_size;
1907 int min_stripes = 1, num_stripes = 1; 1906 int min_stripes, num_stripes = 1;
1908 int i = 0, nr_devices; 1907 int i = 0, nr_devices;
1908 const struct btrfs_raid_attr *rattr;
1909 1909
1910 /* 1910 /*
1911 * We aren't under the device list lock, so this is racy-ish, but good 1911 * We aren't under the device list lock, so this is racy-ish, but good
@@ -1929,21 +1929,18 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
1929 1929
1930 /* calc min stripe number for data space allocation */ 1930 /* calc min stripe number for data space allocation */
1931 type = btrfs_data_alloc_profile(fs_info); 1931 type = btrfs_data_alloc_profile(fs_info);
1932 if (type & BTRFS_BLOCK_GROUP_RAID0) { 1932 rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)];
1933 min_stripes = 2; 1933 min_stripes = rattr->devs_min;
1934
1935 if (type & BTRFS_BLOCK_GROUP_RAID0)
1934 num_stripes = nr_devices; 1936 num_stripes = nr_devices;
1935 } else if (type & BTRFS_BLOCK_GROUP_RAID1) { 1937 else if (type & BTRFS_BLOCK_GROUP_RAID1)
1936 min_stripes = 2;
1937 num_stripes = 2; 1938 num_stripes = 2;
1938 } else if (type & BTRFS_BLOCK_GROUP_RAID10) { 1939 else if (type & BTRFS_BLOCK_GROUP_RAID10)
1939 min_stripes = 4;
1940 num_stripes = 4; 1940 num_stripes = 4;
1941 }
1942 1941
1943 if (type & BTRFS_BLOCK_GROUP_DUP) 1942 /* Adjust for more than 1 stripe per device */
1944 min_stripe_size = 2 * BTRFS_STRIPE_LEN; 1943 min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN;
1945 else
1946 min_stripe_size = BTRFS_STRIPE_LEN;
1947 1944
1948 rcu_read_lock(); 1945 rcu_read_lock();
1949 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 1946 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
@@ -2466,3 +2463,4 @@ late_initcall(init_btrfs_fs);
2466module_exit(exit_btrfs_fs) 2463module_exit(exit_btrfs_fs)
2467 2464
2468MODULE_LICENSE("GPL"); 2465MODULE_LICENSE("GPL");
2466MODULE_SOFTDEP("pre: crc32c");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index c1dfc97893ba..9539f8143b7a 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -16,6 +16,7 @@
16#include "transaction.h" 16#include "transaction.h"
17#include "sysfs.h" 17#include "sysfs.h"
18#include "volumes.h" 18#include "volumes.h"
19#include "space-info.h"
19 20
20static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); 21static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
21static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); 22static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 7bf4d5734dbe..1bf6b5a79191 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -10,6 +10,7 @@
10#include "btrfs-tests.h" 10#include "btrfs-tests.h"
11#include "../ctree.h" 11#include "../ctree.h"
12#include "../extent_io.h" 12#include "../extent_io.h"
13#include "../btrfs_inode.h"
13 14
14#define PROCESS_UNLOCK (1 << 0) 15#define PROCESS_UNLOCK (1 << 0)
15#define PROCESS_RELEASE (1 << 1) 16#define PROCESS_RELEASE (1 << 1)
@@ -58,7 +59,7 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
58static int test_find_delalloc(u32 sectorsize) 59static int test_find_delalloc(u32 sectorsize)
59{ 60{
60 struct inode *inode; 61 struct inode *inode;
61 struct extent_io_tree tmp; 62 struct extent_io_tree *tmp;
62 struct page *page; 63 struct page *page;
63 struct page *locked_page = NULL; 64 struct page *locked_page = NULL;
64 unsigned long index = 0; 65 unsigned long index = 0;
@@ -76,12 +77,13 @@ static int test_find_delalloc(u32 sectorsize)
76 test_std_err(TEST_ALLOC_INODE); 77 test_std_err(TEST_ALLOC_INODE);
77 return -ENOMEM; 78 return -ENOMEM;
78 } 79 }
80 tmp = &BTRFS_I(inode)->io_tree;
79 81
80 /* 82 /*
81 * Passing NULL as we don't have fs_info but tracepoints are not used 83 * Passing NULL as we don't have fs_info but tracepoints are not used
82 * at this point 84 * at this point
83 */ 85 */
84 extent_io_tree_init(NULL, &tmp, IO_TREE_SELFTEST, NULL); 86 extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL);
85 87
86 /* 88 /*
87 * First go through and create and mark all of our pages dirty, we pin 89 * First go through and create and mark all of our pages dirty, we pin
@@ -108,10 +110,10 @@ static int test_find_delalloc(u32 sectorsize)
108 * |--- delalloc ---| 110 * |--- delalloc ---|
109 * |--- search ---| 111 * |--- search ---|
110 */ 112 */
111 set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); 113 set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL);
112 start = 0; 114 start = 0;
113 end = 0; 115 end = 0;
114 found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, 116 found = find_lock_delalloc_range(inode, locked_page, &start,
115 &end); 117 &end);
116 if (!found) { 118 if (!found) {
117 test_err("should have found at least one delalloc"); 119 test_err("should have found at least one delalloc");
@@ -122,7 +124,7 @@ static int test_find_delalloc(u32 sectorsize)
122 sectorsize - 1, start, end); 124 sectorsize - 1, start, end);
123 goto out_bits; 125 goto out_bits;
124 } 126 }
125 unlock_extent(&tmp, start, end); 127 unlock_extent(tmp, start, end);
126 unlock_page(locked_page); 128 unlock_page(locked_page);
127 put_page(locked_page); 129 put_page(locked_page);
128 130
@@ -139,10 +141,10 @@ static int test_find_delalloc(u32 sectorsize)
139 test_err("couldn't find the locked page"); 141 test_err("couldn't find the locked page");
140 goto out_bits; 142 goto out_bits;
141 } 143 }
142 set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); 144 set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL);
143 start = test_start; 145 start = test_start;
144 end = 0; 146 end = 0;
145 found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, 147 found = find_lock_delalloc_range(inode, locked_page, &start,
146 &end); 148 &end);
147 if (!found) { 149 if (!found) {
148 test_err("couldn't find delalloc in our range"); 150 test_err("couldn't find delalloc in our range");
@@ -158,7 +160,7 @@ static int test_find_delalloc(u32 sectorsize)
158 test_err("there were unlocked pages in the range"); 160 test_err("there were unlocked pages in the range");
159 goto out_bits; 161 goto out_bits;
160 } 162 }
161 unlock_extent(&tmp, start, end); 163 unlock_extent(tmp, start, end);
162 /* locked_page was unlocked above */ 164 /* locked_page was unlocked above */
163 put_page(locked_page); 165 put_page(locked_page);
164 166
@@ -176,7 +178,7 @@ static int test_find_delalloc(u32 sectorsize)
176 } 178 }
177 start = test_start; 179 start = test_start;
178 end = 0; 180 end = 0;
179 found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, 181 found = find_lock_delalloc_range(inode, locked_page, &start,
180 &end); 182 &end);
181 if (found) { 183 if (found) {
182 test_err("found range when we shouldn't have"); 184 test_err("found range when we shouldn't have");
@@ -194,10 +196,10 @@ static int test_find_delalloc(u32 sectorsize)
194 * 196 *
195 * We are re-using our test_start from above since it works out well. 197 * We are re-using our test_start from above since it works out well.
196 */ 198 */
197 set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); 199 set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL);
198 start = test_start; 200 start = test_start;
199 end = 0; 201 end = 0;
200 found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, 202 found = find_lock_delalloc_range(inode, locked_page, &start,
201 &end); 203 &end);
202 if (!found) { 204 if (!found) {
203 test_err("didn't find our range"); 205 test_err("didn't find our range");
@@ -213,7 +215,7 @@ static int test_find_delalloc(u32 sectorsize)
213 test_err("pages in range were not all locked"); 215 test_err("pages in range were not all locked");
214 goto out_bits; 216 goto out_bits;
215 } 217 }
216 unlock_extent(&tmp, start, end); 218 unlock_extent(tmp, start, end);
217 219
218 /* 220 /*
219 * Now to test where we run into a page that is no longer dirty in the 221 * Now to test where we run into a page that is no longer dirty in the
@@ -238,7 +240,7 @@ static int test_find_delalloc(u32 sectorsize)
238 * this changes at any point in the future we will need to fix this 240 * this changes at any point in the future we will need to fix this
239 * tests expected behavior. 241 * tests expected behavior.
240 */ 242 */
241 found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, 243 found = find_lock_delalloc_range(inode, locked_page, &start,
242 &end); 244 &end);
243 if (!found) { 245 if (!found) {
244 test_err("didn't find our range"); 246 test_err("didn't find our range");
@@ -256,7 +258,7 @@ static int test_find_delalloc(u32 sectorsize)
256 } 258 }
257 ret = 0; 259 ret = 0;
258out_bits: 260out_bits:
259 clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1); 261 clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1);
260out: 262out:
261 if (locked_page) 263 if (locked_page)
262 put_page(locked_page); 264 put_page(locked_page);
@@ -432,6 +434,89 @@ out:
432 return ret; 434 return ret;
433} 435}
434 436
437static int test_find_first_clear_extent_bit(void)
438{
439 struct extent_io_tree tree;
440 u64 start, end;
441
442 test_msg("running find_first_clear_extent_bit test");
443 extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL);
444
445 /*
446 * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between
447 * 4M-32M
448 */
449 set_extent_bits(&tree, SZ_1M, SZ_4M - 1,
450 CHUNK_TRIMMED | CHUNK_ALLOCATED);
451
452 find_first_clear_extent_bit(&tree, SZ_512K, &start, &end,
453 CHUNK_TRIMMED | CHUNK_ALLOCATED);
454
455	 if (start != 0 || end != SZ_1M - 1)
456 test_err("error finding beginning range: start %llu end %llu",
457 start, end);
458
459 /* Now add 32M-64M so that we have a hole between 4M-32M */
460 set_extent_bits(&tree, SZ_32M, SZ_64M - 1,
461 CHUNK_TRIMMED | CHUNK_ALLOCATED);
462
463 /*
464 * Request first hole starting at 12M, we should get 4M-32M
465 */
466 find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end,
467 CHUNK_TRIMMED | CHUNK_ALLOCATED);
468
469 if (start != SZ_4M || end != SZ_32M - 1)
470 test_err("error finding trimmed range: start %llu end %llu",
471 start, end);
472
473 /*
474 * Search in the middle of allocated range, should get the next one
475 * available, which happens to be unallocated -> 4M-32M
476 */
477 find_first_clear_extent_bit(&tree, SZ_2M, &start, &end,
478 CHUNK_TRIMMED | CHUNK_ALLOCATED);
479
480	 if (start != SZ_4M || end != SZ_32M - 1)
481 test_err("error finding next unalloc range: start %llu end %llu",
482 start, end);
483
484 /*
485	 * Set 64M-72M with CHUNK_ALLOCATED flag, then search for CHUNK_TRIMMED flag
486 * being unset in this range, we should get the entry in range 64M-72M
487 */
488 set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED);
489 find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end,
490 CHUNK_TRIMMED);
491
492 if (start != SZ_64M || end != SZ_64M + SZ_8M - 1)
493 test_err("error finding exact range: start %llu end %llu",
494 start, end);
495
496 find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end,
497 CHUNK_TRIMMED);
498
499 /*
500 * Search in the middle of set range whose immediate neighbour doesn't
501 * have the bits set so it must be returned
502 */
503 if (start != SZ_64M || end != SZ_64M + SZ_8M - 1)
504 test_err("error finding next alloc range: start %llu end %llu",
505 start, end);
506
507 /*
508 * Search beyond any known range, shall return after last known range
509 * and end should be -1
510 */
511 find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED);
512 if (start != SZ_64M + SZ_8M || end != -1)
513 test_err(
514 "error handling beyond end of range search: start %llu end %llu",
515 start, end);
516
517 return 0;
518}
519
435int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) 520int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
436{ 521{
437 int ret; 522 int ret;
@@ -442,6 +527,10 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
442 if (ret) 527 if (ret)
443 goto out; 528 goto out;
444 529
530 ret = test_find_first_clear_extent_bit();
531 if (ret)
532 goto out;
533
445 ret = test_eb_bitmaps(sectorsize, nodesize); 534 ret = test_eb_bitmaps(sectorsize, nodesize);
446out: 535out:
447 return ret; 536 return ret;
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 87aeabe9d610..4a7f796c9900 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -66,7 +66,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
66 em->len = SZ_16K; 66 em->len = SZ_16K;
67 em->block_start = 0; 67 em->block_start = 0;
68 em->block_len = SZ_16K; 68 em->block_len = SZ_16K;
69 write_lock(&em_tree->lock);
69 ret = add_extent_mapping(em_tree, em, 0); 70 ret = add_extent_mapping(em_tree, em, 0);
71 write_unlock(&em_tree->lock);
70 if (ret < 0) { 72 if (ret < 0) {
71 test_err("cannot add extent range [0, 16K)"); 73 test_err("cannot add extent range [0, 16K)");
72 goto out; 74 goto out;
@@ -85,7 +87,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
85 em->len = SZ_4K; 87 em->len = SZ_4K;
86 em->block_start = SZ_32K; /* avoid merging */ 88 em->block_start = SZ_32K; /* avoid merging */
87 em->block_len = SZ_4K; 89 em->block_len = SZ_4K;
90 write_lock(&em_tree->lock);
88 ret = add_extent_mapping(em_tree, em, 0); 91 ret = add_extent_mapping(em_tree, em, 0);
92 write_unlock(&em_tree->lock);
89 if (ret < 0) { 93 if (ret < 0) {
90 test_err("cannot add extent range [16K, 20K)"); 94 test_err("cannot add extent range [16K, 20K)");
91 goto out; 95 goto out;
@@ -104,7 +108,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
104 em->len = len; 108 em->len = len;
105 em->block_start = start; 109 em->block_start = start;
106 em->block_len = len; 110 em->block_len = len;
111 write_lock(&em_tree->lock);
107 ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); 112 ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
113 write_unlock(&em_tree->lock);
108 if (ret) { 114 if (ret) {
109 test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); 115 test_err("case1 [%llu %llu]: ret %d", start, start + len, ret);
110 goto out; 116 goto out;
@@ -148,7 +154,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
148 em->len = SZ_1K; 154 em->len = SZ_1K;
149 em->block_start = EXTENT_MAP_INLINE; 155 em->block_start = EXTENT_MAP_INLINE;
150 em->block_len = (u64)-1; 156 em->block_len = (u64)-1;
157 write_lock(&em_tree->lock);
151 ret = add_extent_mapping(em_tree, em, 0); 158 ret = add_extent_mapping(em_tree, em, 0);
159 write_unlock(&em_tree->lock);
152 if (ret < 0) { 160 if (ret < 0) {
153 test_err("cannot add extent range [0, 1K)"); 161 test_err("cannot add extent range [0, 1K)");
154 goto out; 162 goto out;
@@ -167,7 +175,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
167 em->len = SZ_4K; 175 em->len = SZ_4K;
168 em->block_start = SZ_4K; 176 em->block_start = SZ_4K;
169 em->block_len = SZ_4K; 177 em->block_len = SZ_4K;
178 write_lock(&em_tree->lock);
170 ret = add_extent_mapping(em_tree, em, 0); 179 ret = add_extent_mapping(em_tree, em, 0);
180 write_unlock(&em_tree->lock);
171 if (ret < 0) { 181 if (ret < 0) {
172 test_err("cannot add extent range [4K, 8K)"); 182 test_err("cannot add extent range [4K, 8K)");
173 goto out; 183 goto out;
@@ -186,7 +196,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
186 em->len = SZ_1K; 196 em->len = SZ_1K;
187 em->block_start = EXTENT_MAP_INLINE; 197 em->block_start = EXTENT_MAP_INLINE;
188 em->block_len = (u64)-1; 198 em->block_len = (u64)-1;
199 write_lock(&em_tree->lock);
189 ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); 200 ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
201 write_unlock(&em_tree->lock);
190 if (ret) { 202 if (ret) {
191 test_err("case2 [0 1K]: ret %d", ret); 203 test_err("case2 [0 1K]: ret %d", ret);
192 goto out; 204 goto out;
@@ -225,7 +237,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
225 em->len = SZ_4K; 237 em->len = SZ_4K;
226 em->block_start = SZ_4K; 238 em->block_start = SZ_4K;
227 em->block_len = SZ_4K; 239 em->block_len = SZ_4K;
240 write_lock(&em_tree->lock);
228 ret = add_extent_mapping(em_tree, em, 0); 241 ret = add_extent_mapping(em_tree, em, 0);
242 write_unlock(&em_tree->lock);
229 if (ret < 0) { 243 if (ret < 0) {
230 test_err("cannot add extent range [4K, 8K)"); 244 test_err("cannot add extent range [4K, 8K)");
231 goto out; 245 goto out;
@@ -244,7 +258,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
244 em->len = SZ_16K; 258 em->len = SZ_16K;
245 em->block_start = 0; 259 em->block_start = 0;
246 em->block_len = SZ_16K; 260 em->block_len = SZ_16K;
261 write_lock(&em_tree->lock);
247 ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); 262 ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
263 write_unlock(&em_tree->lock);
248 if (ret) { 264 if (ret) {
249 test_err("case3 [0x%llx 0x%llx): ret %d", 265 test_err("case3 [0x%llx 0x%llx): ret %d",
250 start, start + len, ret); 266 start, start + len, ret);
@@ -320,7 +336,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
320 em->len = SZ_8K; 336 em->len = SZ_8K;
321 em->block_start = 0; 337 em->block_start = 0;
322 em->block_len = SZ_8K; 338 em->block_len = SZ_8K;
339 write_lock(&em_tree->lock);
323 ret = add_extent_mapping(em_tree, em, 0); 340 ret = add_extent_mapping(em_tree, em, 0);
341 write_unlock(&em_tree->lock);
324 if (ret < 0) { 342 if (ret < 0) {
325 test_err("cannot add extent range [0, 8K)"); 343 test_err("cannot add extent range [0, 8K)");
326 goto out; 344 goto out;
@@ -339,7 +357,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
339 em->len = 24 * SZ_1K; 357 em->len = 24 * SZ_1K;
340 em->block_start = SZ_16K; /* avoid merging */ 358 em->block_start = SZ_16K; /* avoid merging */
341 em->block_len = 24 * SZ_1K; 359 em->block_len = 24 * SZ_1K;
360 write_lock(&em_tree->lock);
342 ret = add_extent_mapping(em_tree, em, 0); 361 ret = add_extent_mapping(em_tree, em, 0);
362 write_unlock(&em_tree->lock);
343 if (ret < 0) { 363 if (ret < 0) {
344 test_err("cannot add extent range [8K, 32K)"); 364 test_err("cannot add extent range [8K, 32K)");
345 goto out; 365 goto out;
@@ -357,7 +377,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
357 em->len = SZ_32K; 377 em->len = SZ_32K;
358 em->block_start = 0; 378 em->block_start = 0;
359 em->block_len = SZ_32K; 379 em->block_len = SZ_32K;
380 write_lock(&em_tree->lock);
360 ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); 381 ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
382 write_unlock(&em_tree->lock);
361 if (ret) { 383 if (ret) {
362 test_err("case4 [0x%llx 0x%llx): ret %d", 384 test_err("case4 [0x%llx 0x%llx): ret %d",
363 start, len, ret); 385 start, len, ret);
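
The hunks above wrap every add_extent_mapping() / btrfs_add_extent_mapping() call in the selftests with write_lock()/write_unlock() on the extent map tree's rwlock, matching the rule that the tree may only be modified with the write lock held. A minimal userspace sketch of the same lock-around-insert pattern, assuming made-up demo_tree and insert_range() stand-ins rather than btrfs APIs:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for an extent map tree: only remembers the last inserted range. */
struct demo_tree {
	pthread_rwlock_t lock;          /* analog of extent_map_tree::lock */
	uint64_t start, len;
};

/* Must only run with the write lock held, like add_extent_mapping(). */
static void insert_range(struct demo_tree *t, uint64_t start, uint64_t len)
{
	t->start = start;
	t->len = len;
}

int main(void)
{
	struct demo_tree t = { .start = 0, .len = 0 };

	pthread_rwlock_init(&t.lock, NULL);

	pthread_rwlock_wrlock(&t.lock);   /* write_lock(&em_tree->lock) */
	insert_range(&t, 0, 16 * 1024);   /* add_extent_mapping(em_tree, em, 0) */
	pthread_rwlock_unlock(&t.lock);   /* write_unlock(&em_tree->lock) */

	printf("inserted [%llu, %llu)\n", (unsigned long long)t.start,
	       (unsigned long long)(t.start + t.len));
	return 0;
}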
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3f6811cdf803..3b8ae1a8f02d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -129,6 +129,24 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans)
129} 129}
130 130
131/* 131/*
132 * To be called after all the new block groups attached to the transaction
133 * handle have been created (btrfs_create_pending_block_groups()).
134 */
135void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
136{
137 struct btrfs_fs_info *fs_info = trans->fs_info;
138
139 if (!trans->chunk_bytes_reserved)
140 return;
141
142 WARN_ON_ONCE(!list_empty(&trans->new_bgs));
143
144 btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
145 trans->chunk_bytes_reserved);
146 trans->chunk_bytes_reserved = 0;
147}
148
149/*
132 * either allocate a new transaction or hop into the existing one 150 * either allocate a new transaction or hop into the existing one
133 */ 151 */
134static noinline int join_transaction(struct btrfs_fs_info *fs_info, 152static noinline int join_transaction(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 78c446c222b7..527ea94b57d9 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -224,5 +224,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction);
224void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); 224void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
225void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, 225void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
226 struct btrfs_root *root); 226 struct btrfs_root *root);
227void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
227 228
228#endif 229#endif
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 96fce4bef4e7..ccd5706199d7 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -132,6 +132,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
132 struct btrfs_file_extent_item *fi; 132 struct btrfs_file_extent_item *fi;
133 u32 sectorsize = fs_info->sectorsize; 133 u32 sectorsize = fs_info->sectorsize;
134 u32 item_size = btrfs_item_size_nr(leaf, slot); 134 u32 item_size = btrfs_item_size_nr(leaf, slot);
135 u64 extent_end;
135 136
136 if (!IS_ALIGNED(key->offset, sectorsize)) { 137 if (!IS_ALIGNED(key->offset, sectorsize)) {
137 file_extent_err(leaf, slot, 138 file_extent_err(leaf, slot,
@@ -207,6 +208,16 @@ static int check_extent_data_item(struct extent_buffer *leaf,
207 CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)) 208 CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize))
208 return -EUCLEAN; 209 return -EUCLEAN;
209 210
211 /* Catch extent end overflow */
212 if (check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi),
213 key->offset, &extent_end)) {
214 file_extent_err(leaf, slot,
215 "extent end overflow, have file offset %llu extent num bytes %llu",
216 key->offset,
217 btrfs_file_extent_num_bytes(leaf, fi));
218 return -EUCLEAN;
219 }
220
210 /* 221 /*
211 * Check that no two consecutive file extent items, in the same leaf, 222 * Check that no two consecutive file extent items, in the same leaf,
212 * present ranges that overlap each other. 223 * present ranges that overlap each other.
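
The new guard above rejects file extent items whose end (key->offset + num_bytes) would wrap around u64. A small userspace sketch of the same check, using __builtin_add_overflow(), the compiler builtin that check_add_overflow() typically wraps; extent_end_overflows() and the sample values are illustrative only:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirror of the tree-checker guard: reject extents whose end wraps u64. */
static bool extent_end_overflows(uint64_t file_offset, uint64_t num_bytes,
				 uint64_t *extent_end)
{
	return __builtin_add_overflow(file_offset, num_bytes, extent_end);
}

int main(void)
{
	uint64_t end;

	/* Normal extent: 1 MiB at file offset 4 MiB, no overflow. */
	if (!extent_end_overflows(4ULL << 20, 1ULL << 20, &end))
		printf("extent end %llu\n", (unsigned long long)end);

	/* Corrupted item: num_bytes pushes the end past U64_MAX. */
	if (extent_end_overflows(UINT64_MAX - 4096, 8192, &end))
		printf("extent end overflow, item rejected (-EUCLEAN)\n");

	return 0;
}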
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3fc8d854d7fb..6c8297bcfeb7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3323,6 +3323,30 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3323} 3323}
3324 3324
3325/* 3325/*
3326 * Check if an inode was logged in the current transaction. We can't always rely
3327 * on an inode's logged_trans value, because it's an in-memory only field and
3328 * therefore not persisted. This means that its value is lost if the inode gets
3329 * evicted and loaded again from disk (in which case it has a value of 0, and
3330 certainly it is smaller than any possible transaction ID), when that happens
3331 the full_sync flag is set in the inode's runtime flags, so in that case we
3332 * assume eviction happened and ignore the logged_trans value, assuming the
3333 * worst case, that the inode was logged before in the current transaction.
3334 */
3335static bool inode_logged(struct btrfs_trans_handle *trans,
3336 struct btrfs_inode *inode)
3337{
3338 if (inode->logged_trans == trans->transid)
3339 return true;
3340
3341 if (inode->last_trans == trans->transid &&
3342 test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
3343 !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3344 return true;
3345
3346 return false;
3347}
3348
3349/*
3326 * If both a file and directory are logged, and unlinks or renames are 3350 * If both a file and directory are logged, and unlinks or renames are
3327 * mixed in, we have a few interesting corners: 3351 * mixed in, we have a few interesting corners:
3328 * 3352 *
@@ -3356,7 +3380,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3356 int bytes_del = 0; 3380 int bytes_del = 0;
3357 u64 dir_ino = btrfs_ino(dir); 3381 u64 dir_ino = btrfs_ino(dir);
3358 3382
3359 if (dir->logged_trans < trans->transid) 3383 if (!inode_logged(trans, dir))
3360 return 0; 3384 return 0;
3361 3385
3362 ret = join_running_log_trans(root); 3386 ret = join_running_log_trans(root);
@@ -3460,7 +3484,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3460 u64 index; 3484 u64 index;
3461 int ret; 3485 int ret;
3462 3486
3463 if (inode->logged_trans < trans->transid) 3487 if (!inode_logged(trans, inode))
3464 return 0; 3488 return 0;
3465 3489
3466 ret = join_running_log_trans(root); 3490 ret = join_running_log_trans(root);
@@ -5420,9 +5444,19 @@ log_extents:
5420 } 5444 }
5421 } 5445 }
5422 5446
5447 /*
5448 * Don't update last_log_commit if we logged that an inode exists after
5449 * it was loaded to memory (full_sync bit set).
5450 * This is to prevent data loss when we do a write to the inode, then
5451 * the inode gets evicted after all delalloc was flushed, then we log
5452 * it exists (due to a rename for example) and then fsync it. This last
5453 * fsync would do nothing (not logging the extents previously written).
5454 */
5423 spin_lock(&inode->lock); 5455 spin_lock(&inode->lock);
5424 inode->logged_trans = trans->transid; 5456 inode->logged_trans = trans->transid;
5425 inode->last_log_commit = inode->last_sub_trans; 5457 if (inode_only != LOG_INODE_EXISTS ||
5458 !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5459 inode->last_log_commit = inode->last_sub_trans;
5426 spin_unlock(&inode->lock); 5460 spin_unlock(&inode->lock);
5427out_unlock: 5461out_unlock:
5428 mutex_unlock(&inode->log_mutex); 5462 mutex_unlock(&inode->log_mutex);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1c2a6e4b39da..a13ddba1ebc3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -28,6 +28,7 @@
28#include "dev-replace.h" 28#include "dev-replace.h"
29#include "sysfs.h" 29#include "sysfs.h"
30#include "tree-checker.h" 30#include "tree-checker.h"
31#include "space-info.h"
31 32
32const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 33const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
33 [BTRFS_RAID_RAID10] = { 34 [BTRFS_RAID_RAID10] = {
@@ -123,12 +124,14 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
123 }, 124 },
124}; 125};
125 126
126const char *get_raid_name(enum btrfs_raid_types type) 127const char *btrfs_bg_type_to_raid_name(u64 flags)
127{ 128{
128 if (type >= BTRFS_NR_RAID_TYPES) 129 const int index = btrfs_bg_flags_to_raid_index(flags);
130
131 if (index >= BTRFS_NR_RAID_TYPES)
129 return NULL; 132 return NULL;
130 133
131 return btrfs_raid_array[type].raid_name; 134 return btrfs_raid_array[index].raid_name;
132} 135}
133 136
134/* 137/*
@@ -237,7 +240,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
237 * chunk_mutex 240 * chunk_mutex
238 * ----------- 241 * -----------
239 * protects chunks, adding or removing during allocation, trim or when a new 242 * protects chunks, adding or removing during allocation, trim or when a new
240 * device is added/removed 243 * device is added/removed. Additionally it also protects post_commit_list of
244 * individual devices, since they can be added to the transaction's
245 * post_commit_list only with chunk_mutex held.
241 * 246 *
242 * cleaner_mutex 247 * cleaner_mutex
243 * ------------- 248 * -------------
@@ -1818,7 +1823,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1818 struct rb_node *n; 1823 struct rb_node *n;
1819 u64 ret = 0; 1824 u64 ret = 0;
1820 1825
1821 em_tree = &fs_info->mapping_tree.map_tree; 1826 em_tree = &fs_info->mapping_tree;
1822 read_lock(&em_tree->lock); 1827 read_lock(&em_tree->lock);
1823 n = rb_last(&em_tree->map.rb_root); 1828 n = rb_last(&em_tree->map.rb_root);
1824 if (n) { 1829 if (n) {
@@ -2941,7 +2946,7 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2941 struct extent_map_tree *em_tree; 2946 struct extent_map_tree *em_tree;
2942 struct extent_map *em; 2947 struct extent_map *em;
2943 2948
2944 em_tree = &fs_info->mapping_tree.map_tree; 2949 em_tree = &fs_info->mapping_tree;
2945 read_lock(&em_tree->lock); 2950 read_lock(&em_tree->lock);
2946 em = lookup_extent_mapping(em_tree, logical, length); 2951 em = lookup_extent_mapping(em_tree, logical, length);
2947 read_unlock(&em_tree->lock); 2952 read_unlock(&em_tree->lock);
@@ -3474,6 +3479,18 @@ static int chunk_devid_filter(struct extent_buffer *leaf,
3474 return 1; 3479 return 1;
3475} 3480}
3476 3481
3482static u64 calc_data_stripes(u64 type, int num_stripes)
3483{
3484 const int index = btrfs_bg_flags_to_raid_index(type);
3485 const int ncopies = btrfs_raid_array[index].ncopies;
3486 const int nparity = btrfs_raid_array[index].nparity;
3487
3488 if (nparity)
3489 return num_stripes - nparity;
3490 else
3491 return num_stripes / ncopies;
3492}
3493
3477/* [pstart, pend) */ 3494/* [pstart, pend) */
3478static int chunk_drange_filter(struct extent_buffer *leaf, 3495static int chunk_drange_filter(struct extent_buffer *leaf,
3479 struct btrfs_chunk *chunk, 3496 struct btrfs_chunk *chunk,
@@ -3483,22 +3500,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
3483 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3500 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3484 u64 stripe_offset; 3501 u64 stripe_offset;
3485 u64 stripe_length; 3502 u64 stripe_length;
3503 u64 type;
3486 int factor; 3504 int factor;
3487 int i; 3505 int i;
3488 3506
3489 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3507 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3490 return 0; 3508 return 0;
3491 3509
3492 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 3510 type = btrfs_chunk_type(leaf, chunk);
3493 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { 3511 factor = calc_data_stripes(type, num_stripes);
3494 factor = num_stripes / 2;
3495 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
3496 factor = num_stripes - 1;
3497 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
3498 factor = num_stripes - 2;
3499 } else {
3500 factor = num_stripes;
3501 }
3502 3512
3503 for (i = 0; i < num_stripes; i++) { 3513 for (i = 0; i < num_stripes; i++) {
3504 stripe = btrfs_stripe_nr(chunk, i); 3514 stripe = btrfs_stripe_nr(chunk, i);
@@ -3921,11 +3931,9 @@ static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
3921 bp += ret; \ 3931 bp += ret; \
3922 } while (0) 3932 } while (0)
3923 3933
3924 if (flags & BTRFS_BALANCE_ARGS_CONVERT) { 3934 if (flags & BTRFS_BALANCE_ARGS_CONVERT)
3925 int index = btrfs_bg_flags_to_raid_index(bargs->target); 3935 CHECK_APPEND_1ARG("convert=%s,",
3926 3936 btrfs_bg_type_to_raid_name(bargs->target));
3927 CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index));
3928 }
3929 3937
3930 if (flags & BTRFS_BALANCE_ARGS_SOFT) 3938 if (flags & BTRFS_BALANCE_ARGS_SOFT)
3931 CHECK_APPEND_NOARG("soft,"); 3939 CHECK_APPEND_NOARG("soft,");
@@ -4047,6 +4055,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
4047 u64 num_devices; 4055 u64 num_devices;
4048 unsigned seq; 4056 unsigned seq;
4049 bool reducing_integrity; 4057 bool reducing_integrity;
4058 int i;
4050 4059
4051 if (btrfs_fs_closing(fs_info) || 4060 if (btrfs_fs_closing(fs_info) ||
4052 atomic_read(&fs_info->balance_pause_req) || 4061 atomic_read(&fs_info->balance_pause_req) ||
@@ -4076,48 +4085,43 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
4076 } 4085 }
4077 4086
4078 num_devices = btrfs_num_devices(fs_info); 4087 num_devices = btrfs_num_devices(fs_info);
4088 allowed = 0;
4089 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4090 if (num_devices >= btrfs_raid_array[i].devs_min)
4091 allowed |= btrfs_raid_array[i].bg_flag;
4079 4092
4080 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
4081 if (num_devices > 1)
4082 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
4083 if (num_devices > 2)
4084 allowed |= BTRFS_BLOCK_GROUP_RAID5;
4085 if (num_devices > 3)
4086 allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
4087 BTRFS_BLOCK_GROUP_RAID6);
4088 if (validate_convert_profile(&bctl->data, allowed)) { 4093 if (validate_convert_profile(&bctl->data, allowed)) {
4089 int index = btrfs_bg_flags_to_raid_index(bctl->data.target);
4090
4091 btrfs_err(fs_info, 4094 btrfs_err(fs_info,
4092 "balance: invalid convert data profile %s", 4095 "balance: invalid convert data profile %s",
4093 get_raid_name(index)); 4096 btrfs_bg_type_to_raid_name(bctl->data.target));
4094 ret = -EINVAL; 4097 ret = -EINVAL;
4095 goto out; 4098 goto out;
4096 } 4099 }
4097 if (validate_convert_profile(&bctl->meta, allowed)) { 4100 if (validate_convert_profile(&bctl->meta, allowed)) {
4098 int index = btrfs_bg_flags_to_raid_index(bctl->meta.target);
4099
4100 btrfs_err(fs_info, 4101 btrfs_err(fs_info,
4101 "balance: invalid convert metadata profile %s", 4102 "balance: invalid convert metadata profile %s",
4102 get_raid_name(index)); 4103 btrfs_bg_type_to_raid_name(bctl->meta.target));
4103 ret = -EINVAL; 4104 ret = -EINVAL;
4104 goto out; 4105 goto out;
4105 } 4106 }
4106 if (validate_convert_profile(&bctl->sys, allowed)) { 4107 if (validate_convert_profile(&bctl->sys, allowed)) {
4107 int index = btrfs_bg_flags_to_raid_index(bctl->sys.target);
4108
4109 btrfs_err(fs_info, 4108 btrfs_err(fs_info,
4110 "balance: invalid convert system profile %s", 4109 "balance: invalid convert system profile %s",
4111 get_raid_name(index)); 4110 btrfs_bg_type_to_raid_name(bctl->sys.target));
4112 ret = -EINVAL; 4111 ret = -EINVAL;
4113 goto out; 4112 goto out;
4114 } 4113 }
4115 4114
4116 /* allow to reduce meta or sys integrity only if force set */ 4115 /*
4117 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 4116 * Allow to reduce metadata or system integrity only if force set for
4118 BTRFS_BLOCK_GROUP_RAID10 | 4117 * profiles with redundancy (copies, parity)
4119 BTRFS_BLOCK_GROUP_RAID5 | 4118 */
4120 BTRFS_BLOCK_GROUP_RAID6; 4119 allowed = 0;
4120 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4121 if (btrfs_raid_array[i].ncopies >= 2 ||
4122 btrfs_raid_array[i].tolerated_failures >= 1)
4123 allowed |= btrfs_raid_array[i].bg_flag;
4124 }
4121 do { 4125 do {
4122 seq = read_seqbegin(&fs_info->profiles_lock); 4126 seq = read_seqbegin(&fs_info->profiles_lock);
4123 4127
@@ -4152,12 +4156,18 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
4152 4156
4153 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 4157 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4154 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4158 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4155 int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
4156 int data_index = btrfs_bg_flags_to_raid_index(data_target);
4157
4158 btrfs_warn(fs_info, 4159 btrfs_warn(fs_info,
4159 "balance: metadata profile %s has lower redundancy than data profile %s", 4160 "balance: metadata profile %s has lower redundancy than data profile %s",
4160 get_raid_name(meta_index), get_raid_name(data_index)); 4161 btrfs_bg_type_to_raid_name(meta_target),
4162 btrfs_bg_type_to_raid_name(data_target));
4163 }
4164
4165 if (fs_info->send_in_progress) {
4166 btrfs_warn_rl(fs_info,
4167"cannot run balance while send operations are in progress (%d in progress)",
4168 fs_info->send_in_progress);
4169 ret = -EAGAIN;
4170 goto out;
4161 } 4171 }
4162 4172
4163 ret = insert_balance_item(fs_info, bctl); 4173 ret = insert_balance_item(fs_info, bctl);
@@ -4949,6 +4959,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4949 sub_stripes = btrfs_raid_array[index].sub_stripes; 4959 sub_stripes = btrfs_raid_array[index].sub_stripes;
4950 dev_stripes = btrfs_raid_array[index].dev_stripes; 4960 dev_stripes = btrfs_raid_array[index].dev_stripes;
4951 devs_max = btrfs_raid_array[index].devs_max; 4961 devs_max = btrfs_raid_array[index].devs_max;
4962 if (!devs_max)
4963 devs_max = BTRFS_MAX_DEVS(info);
4952 devs_min = btrfs_raid_array[index].devs_min; 4964 devs_min = btrfs_raid_array[index].devs_min;
4953 devs_increment = btrfs_raid_array[index].devs_increment; 4965 devs_increment = btrfs_raid_array[index].devs_increment;
4954 ncopies = btrfs_raid_array[index].ncopies; 4966 ncopies = btrfs_raid_array[index].ncopies;
@@ -4957,8 +4969,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4957 if (type & BTRFS_BLOCK_GROUP_DATA) { 4969 if (type & BTRFS_BLOCK_GROUP_DATA) {
4958 max_stripe_size = SZ_1G; 4970 max_stripe_size = SZ_1G;
4959 max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; 4971 max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
4960 if (!devs_max)
4961 devs_max = BTRFS_MAX_DEVS(info);
4962 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 4972 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4963 /* for larger filesystems, use larger metadata chunks */ 4973 /* for larger filesystems, use larger metadata chunks */
4964 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 4974 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
@@ -4966,13 +4976,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4966 else 4976 else
4967 max_stripe_size = SZ_256M; 4977 max_stripe_size = SZ_256M;
4968 max_chunk_size = max_stripe_size; 4978 max_chunk_size = max_stripe_size;
4969 if (!devs_max)
4970 devs_max = BTRFS_MAX_DEVS(info);
4971 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 4979 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4972 max_stripe_size = SZ_32M; 4980 max_stripe_size = SZ_32M;
4973 max_chunk_size = 2 * max_stripe_size; 4981 max_chunk_size = 2 * max_stripe_size;
4974 if (!devs_max)
4975 devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
4976 } else { 4982 } else {
4977 btrfs_err(info, "invalid chunk type 0x%llx requested", 4983 btrfs_err(info, "invalid chunk type 0x%llx requested",
4978 type); 4984 type);
@@ -5143,7 +5149,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
5143 em->block_len = em->len; 5149 em->block_len = em->len;
5144 em->orig_block_len = stripe_size; 5150 em->orig_block_len = stripe_size;
5145 5151
5146 em_tree = &info->mapping_tree.map_tree; 5152 em_tree = &info->mapping_tree;
5147 write_lock(&em_tree->lock); 5153 write_lock(&em_tree->lock);
5148 ret = add_extent_mapping(em_tree, em, 0); 5154 ret = add_extent_mapping(em_tree, em, 0);
5149 if (ret) { 5155 if (ret) {
@@ -5324,20 +5330,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5324 5330
5325static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5331static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5326{ 5332{
5327 int max_errors; 5333 const int index = btrfs_bg_flags_to_raid_index(map->type);
5328 5334
5329 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5335 return btrfs_raid_array[index].tolerated_failures;
5330 BTRFS_BLOCK_GROUP_RAID10 |
5331 BTRFS_BLOCK_GROUP_RAID5 |
5332 BTRFS_BLOCK_GROUP_DUP)) {
5333 max_errors = 1;
5334 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
5335 max_errors = 2;
5336 } else {
5337 max_errors = 0;
5338 }
5339
5340 return max_errors;
5341} 5336}
5342 5337
5343int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5338int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
@@ -5378,21 +5373,16 @@ end:
5378 return readonly; 5373 return readonly;
5379} 5374}
5380 5375
5381void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 5376void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5382{
5383 extent_map_tree_init(&tree->map_tree);
5384}
5385
5386void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
5387{ 5377{
5388 struct extent_map *em; 5378 struct extent_map *em;
5389 5379
5390 while (1) { 5380 while (1) {
5391 write_lock(&tree->map_tree.lock); 5381 write_lock(&tree->lock);
5392 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); 5382 em = lookup_extent_mapping(tree, 0, (u64)-1);
5393 if (em) 5383 if (em)
5394 remove_extent_mapping(&tree->map_tree, em); 5384 remove_extent_mapping(tree, em);
5395 write_unlock(&tree->map_tree.lock); 5385 write_unlock(&tree->lock);
5396 if (!em) 5386 if (!em)
5397 break; 5387 break;
5398 /* once for us */ 5388 /* once for us */
@@ -5419,7 +5409,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5419 return 1; 5409 return 1;
5420 5410
5421 map = em->map_lookup; 5411 map = em->map_lookup;
5422 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) 5412 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
5423 ret = map->num_stripes; 5413 ret = map->num_stripes;
5424 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5414 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5425 ret = map->sub_stripes; 5415 ret = map->sub_stripes;
@@ -5493,7 +5483,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
5493 struct btrfs_device *srcdev; 5483 struct btrfs_device *srcdev;
5494 5484
5495 ASSERT((map->type & 5485 ASSERT((map->type &
5496 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); 5486 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5497 5487
5498 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5488 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5499 num_stripes = map->sub_stripes; 5489 num_stripes = map->sub_stripes;
@@ -5682,7 +5672,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5682 &remaining_stripes); 5672 &remaining_stripes);
5683 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 5673 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5684 last_stripe *= sub_stripes; 5674 last_stripe *= sub_stripes;
5685 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5675 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
5686 BTRFS_BLOCK_GROUP_DUP)) { 5676 BTRFS_BLOCK_GROUP_DUP)) {
5687 num_stripes = map->num_stripes; 5677 num_stripes = map->num_stripes;
5688 } else { 5678 } else {
@@ -5926,6 +5916,102 @@ static bool need_full_stripe(enum btrfs_map_op op)
5926 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 5916 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
5927} 5917}
5928 5918
5919/*
5920 * btrfs_get_io_geometry - calculates the geometry of a particular (address, len)
5921 * tuple. This information is used to calculate how big a
5922 * particular bio can get before it straddles a stripe.
5923 *
5924 * @fs_info - the filesystem
5925 * @logical - address that we want to figure out the geometry of
5926 * @len - the length of IO we are going to perform, starting at @logical
5927 * @op - type of operation - write or read
5928 * @io_geom - pointer used to return values
5929 *
5930 * Returns < 0 in case a chunk for the given logical address cannot be found,
5931 * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
5932 */
5933int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
5934 u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
5935{
5936 struct extent_map *em;
5937 struct map_lookup *map;
5938 u64 offset;
5939 u64 stripe_offset;
5940 u64 stripe_nr;
5941 u64 stripe_len;
5942 u64 raid56_full_stripe_start = (u64)-1;
5943 int data_stripes;
5944
5945 ASSERT(op != BTRFS_MAP_DISCARD);
5946
5947 em = btrfs_get_chunk_map(fs_info, logical, len);
5948 if (IS_ERR(em))
5949 return PTR_ERR(em);
5950
5951 map = em->map_lookup;
5952 /* Offset of this logical address in the chunk */
5953 offset = logical - em->start;
5954 /* Len of a stripe in a chunk */
5955 stripe_len = map->stripe_len;
5956	 /* Stripe where this block falls in */
5957 stripe_nr = div64_u64(offset, stripe_len);
5958 /* Offset of stripe in the chunk */
5959 stripe_offset = stripe_nr * stripe_len;
5960 if (offset < stripe_offset) {
5961 btrfs_crit(fs_info,
5962"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
5963 stripe_offset, offset, em->start, logical, stripe_len);
5964 free_extent_map(em);
5965 return -EINVAL;
5966 }
5967
5968 /* stripe_offset is the offset of this block in its stripe */
5969 stripe_offset = offset - stripe_offset;
5970 data_stripes = nr_data_stripes(map);
5971
5972 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
5973 u64 max_len = stripe_len - stripe_offset;
5974
5975 /*
5976 * In case of raid56, we need to know the stripe aligned start
5977 */
5978 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5979 unsigned long full_stripe_len = stripe_len * data_stripes;
5980 raid56_full_stripe_start = offset;
5981
5982 /*
5983 * Allow a write of a full stripe, but make sure we
5984 * don't allow straddling of stripes
5985 */
5986 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
5987 full_stripe_len);
5988 raid56_full_stripe_start *= full_stripe_len;
5989
5990 /*
5991 * For writes to RAID[56], allow a full stripeset across
5992 * all disks. For other RAID types and for RAID[56]
5993 * reads, just allow a single stripe (on a single disk).
5994 */
5995 if (op == BTRFS_MAP_WRITE) {
5996 max_len = stripe_len * data_stripes -
5997 (offset - raid56_full_stripe_start);
5998 }
5999 }
6000 len = min_t(u64, em->len - offset, max_len);
6001 } else {
6002 len = em->len - offset;
6003 }
6004
6005 io_geom->len = len;
6006 io_geom->offset = offset;
6007 io_geom->stripe_len = stripe_len;
6008 io_geom->stripe_nr = stripe_nr;
6009 io_geom->stripe_offset = stripe_offset;
6010 io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6011
6012 return 0;
6013}
6014
5929static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 6015static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
5930 enum btrfs_map_op op, 6016 enum btrfs_map_op op,
5931 u64 logical, u64 *length, 6017 u64 logical, u64 *length,
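
The arithmetic moved into btrfs_get_io_geometry() amounts to a few integer divisions: which stripe the logical address falls into, the offset within that stripe, and, for RAID5/6 writes, the start of the enclosing full stripe plus how many bytes fit before the IO would straddle it. The sketch below redoes that math in plain C for one example chunk (64K stripes, 3 data stripes); demo_geom() is an illustration of the calculation, not the kernel function:

#include <stdint.h>
#include <stdio.h>

struct demo_geometry {
	uint64_t stripe_nr;          /* stripe the address falls into */
	uint64_t stripe_offset;      /* offset of the address within that stripe */
	uint64_t full_stripe_start;  /* raid56 only: start of the full stripe */
	uint64_t max_len;            /* bytes before the IO would straddle */
};

/* Mirrors the geometry arithmetic for a raid56 write within one chunk. */
static void demo_geom(uint64_t offset, uint64_t stripe_len, int data_stripes,
		      struct demo_geometry *g)
{
	uint64_t full_stripe_len = stripe_len * data_stripes;

	g->stripe_nr = offset / stripe_len;
	g->stripe_offset = offset - g->stripe_nr * stripe_len;
	g->full_stripe_start = (offset / full_stripe_len) * full_stripe_len;
	/* A write may run to the end of the full stripe, but not past it. */
	g->max_len = full_stripe_len - (offset - g->full_stripe_start);
}

int main(void)
{
	/* Example: 64K stripes, 3 data stripes (e.g. raid5 on 4 devices),
	 * IO starting 200K into the chunk. */
	struct demo_geometry g;

	demo_geom(200 * 1024, 64 * 1024, 3, &g);
	printf("stripe_nr=%llu stripe_offset=%llu full_stripe_start=%llu max_len=%llu\n",
	       (unsigned long long)g.stripe_nr,
	       (unsigned long long)g.stripe_offset,
	       (unsigned long long)g.full_stripe_start,
	       (unsigned long long)g.max_len);
	return 0;
}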
@@ -5939,6 +6025,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
5939 u64 stripe_nr; 6025 u64 stripe_nr;
5940 u64 stripe_len; 6026 u64 stripe_len;
5941 u32 stripe_index; 6027 u32 stripe_index;
6028 int data_stripes;
5942 int i; 6029 int i;
5943 int ret = 0; 6030 int ret = 0;
5944 int num_stripes; 6031 int num_stripes;
@@ -5951,76 +6038,29 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
5951 int patch_the_first_stripe_for_dev_replace = 0; 6038 int patch_the_first_stripe_for_dev_replace = 0;
5952 u64 physical_to_patch_in_first_stripe = 0; 6039 u64 physical_to_patch_in_first_stripe = 0;
5953 u64 raid56_full_stripe_start = (u64)-1; 6040 u64 raid56_full_stripe_start = (u64)-1;
6041 struct btrfs_io_geometry geom;
6042
6043 ASSERT(bbio_ret);
5954 6044
5955 if (op == BTRFS_MAP_DISCARD) 6045 if (op == BTRFS_MAP_DISCARD)
5956 return __btrfs_map_block_for_discard(fs_info, logical, 6046 return __btrfs_map_block_for_discard(fs_info, logical,
5957 *length, bbio_ret); 6047 *length, bbio_ret);
5958 6048
5959 em = btrfs_get_chunk_map(fs_info, logical, *length); 6049 ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
5960 if (IS_ERR(em)) 6050 if (ret < 0)
5961 return PTR_ERR(em); 6051 return ret;
5962 6052
6053 em = btrfs_get_chunk_map(fs_info, logical, *length);
6054 ASSERT(em);
5963 map = em->map_lookup; 6055 map = em->map_lookup;
5964 offset = logical - em->start;
5965
5966 stripe_len = map->stripe_len;
5967 stripe_nr = offset;
5968 /*
5969 * stripe_nr counts the total number of stripes we have to stride
5970 * to get to this block
5971 */
5972 stripe_nr = div64_u64(stripe_nr, stripe_len);
5973
5974 stripe_offset = stripe_nr * stripe_len;
5975 if (offset < stripe_offset) {
5976 btrfs_crit(fs_info,
5977 "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
5978 stripe_offset, offset, em->start, logical,
5979 stripe_len);
5980 free_extent_map(em);
5981 return -EINVAL;
5982 }
5983
5984 /* stripe_offset is the offset of this block in its stripe*/
5985 stripe_offset = offset - stripe_offset;
5986
5987 /* if we're here for raid56, we need to know the stripe aligned start */
5988 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5989 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
5990 raid56_full_stripe_start = offset;
5991 6056
5992 /* allow a write of a full stripe, but make sure we don't 6057 *length = geom.len;
5993 * allow straddling of stripes 6058 offset = geom.offset;
5994 */ 6059 stripe_len = geom.stripe_len;
5995 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 6060 stripe_nr = geom.stripe_nr;
5996 full_stripe_len); 6061 stripe_offset = geom.stripe_offset;
5997 raid56_full_stripe_start *= full_stripe_len; 6062 raid56_full_stripe_start = geom.raid56_stripe_offset;
5998 } 6063 data_stripes = nr_data_stripes(map);
5999
6000 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
6001 u64 max_len;
6002 /* For writes to RAID[56], allow a full stripeset across all disks.
6003 For other RAID types and for RAID[56] reads, just allow a single
6004 stripe (on a single disk). */
6005 if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6006 (op == BTRFS_MAP_WRITE)) {
6007 max_len = stripe_len * nr_data_stripes(map) -
6008 (offset - raid56_full_stripe_start);
6009 } else {
6010 /* we limit the length of each bio to what fits in a stripe */
6011 max_len = stripe_len - stripe_offset;
6012 }
6013 *length = min_t(u64, em->len - offset, max_len);
6014 } else {
6015 *length = em->len - offset;
6016 }
6017
6018 /*
6019 * This is for when we're called from btrfs_bio_fits_in_stripe and all
6020 * it cares about is the length
6021 */
6022 if (!bbio_ret)
6023 goto out;
6024 6064
6025 down_read(&dev_replace->rwsem); 6065 down_read(&dev_replace->rwsem);
6026 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 6066 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
@@ -6052,7 +6092,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6052 &stripe_index); 6092 &stripe_index);
6053 if (!need_full_stripe(op)) 6093 if (!need_full_stripe(op))
6054 mirror_num = 1; 6094 mirror_num = 1;
6055 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 6095 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6056 if (need_full_stripe(op)) 6096 if (need_full_stripe(op))
6057 num_stripes = map->num_stripes; 6097 num_stripes = map->num_stripes;
6058 else if (mirror_num) 6098 else if (mirror_num)
@@ -6094,7 +6134,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6094 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 6134 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
6095 /* push stripe_nr back to the start of the full stripe */ 6135 /* push stripe_nr back to the start of the full stripe */
6096 stripe_nr = div64_u64(raid56_full_stripe_start, 6136 stripe_nr = div64_u64(raid56_full_stripe_start,
6097 stripe_len * nr_data_stripes(map)); 6137 stripe_len * data_stripes);
6098 6138
6099 /* RAID[56] write or recovery. Return all stripes */ 6139 /* RAID[56] write or recovery. Return all stripes */
6100 num_stripes = map->num_stripes; 6140 num_stripes = map->num_stripes;
@@ -6110,10 +6150,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6110 * Mirror #3 is RAID6 Q block. 6150 * Mirror #3 is RAID6 Q block.
6111 */ 6151 */
6112 stripe_nr = div_u64_rem(stripe_nr, 6152 stripe_nr = div_u64_rem(stripe_nr,
6113 nr_data_stripes(map), &stripe_index); 6153 data_stripes, &stripe_index);
6114 if (mirror_num > 1) 6154 if (mirror_num > 1)
6115 stripe_index = nr_data_stripes(map) + 6155 stripe_index = data_stripes + mirror_num - 2;
6116 mirror_num - 2;
6117 6156
6118 /* We distribute the parity blocks across stripes */ 6157 /* We distribute the parity blocks across stripes */
6119 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 6158 div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
@@ -6171,8 +6210,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6171 div_u64_rem(stripe_nr, num_stripes, &rot); 6210 div_u64_rem(stripe_nr, num_stripes, &rot);
6172 6211
6173 /* Fill in the logical address of each stripe */ 6212 /* Fill in the logical address of each stripe */
6174 tmp = stripe_nr * nr_data_stripes(map); 6213 tmp = stripe_nr * data_stripes;
6175 for (i = 0; i < nr_data_stripes(map); i++) 6214 for (i = 0; i < data_stripes; i++)
6176 bbio->raid_map[(i+rot) % num_stripes] = 6215 bbio->raid_map[(i+rot) % num_stripes] =
6177 em->start + (tmp + i) * map->stripe_len; 6216 em->start + (tmp + i) * map->stripe_len;
6178 6217
@@ -6687,7 +6726,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
6687 struct btrfs_chunk *chunk) 6726 struct btrfs_chunk *chunk)
6688{ 6727{
6689 struct btrfs_fs_info *fs_info = leaf->fs_info; 6728 struct btrfs_fs_info *fs_info = leaf->fs_info;
6690 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 6729 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
6691 struct map_lookup *map; 6730 struct map_lookup *map;
6692 struct extent_map *em; 6731 struct extent_map *em;
6693 u64 logical; 6732 u64 logical;
@@ -6712,9 +6751,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
6712 return ret; 6751 return ret;
6713 } 6752 }
6714 6753
6715 read_lock(&map_tree->map_tree.lock); 6754 read_lock(&map_tree->lock);
6716 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); 6755 em = lookup_extent_mapping(map_tree, logical, 1);
6717 read_unlock(&map_tree->map_tree.lock); 6756 read_unlock(&map_tree->lock);
6718 6757
6719 /* already mapped? */ 6758 /* already mapped? */
6720 if (em && em->start <= logical && em->start + em->len > logical) { 6759 if (em && em->start <= logical && em->start + em->len > logical) {
@@ -6783,9 +6822,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
6783 6822
6784 } 6823 }
6785 6824
6786 write_lock(&map_tree->map_tree.lock); 6825 write_lock(&map_tree->lock);
6787 ret = add_extent_mapping(&map_tree->map_tree, em, 0); 6826 ret = add_extent_mapping(map_tree, em, 0);
6788 write_unlock(&map_tree->map_tree.lock); 6827 write_unlock(&map_tree->lock);
6789 if (ret < 0) { 6828 if (ret < 0) {
6790 btrfs_err(fs_info, 6829 btrfs_err(fs_info,
6791 "failed to add chunk map, start=%llu len=%llu: %d", 6830 "failed to add chunk map, start=%llu len=%llu: %d",
@@ -7103,14 +7142,14 @@ out_short_read:
7103bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 7142bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7104 struct btrfs_device *failing_dev) 7143 struct btrfs_device *failing_dev)
7105{ 7144{
7106 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 7145 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7107 struct extent_map *em; 7146 struct extent_map *em;
7108 u64 next_start = 0; 7147 u64 next_start = 0;
7109 bool ret = true; 7148 bool ret = true;
7110 7149
7111 read_lock(&map_tree->map_tree.lock); 7150 read_lock(&map_tree->lock);
7112 em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); 7151 em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7113 read_unlock(&map_tree->map_tree.lock); 7152 read_unlock(&map_tree->lock);
7114 /* No chunk at all? Return false anyway */ 7153 /* No chunk at all? Return false anyway */
7115 if (!em) { 7154 if (!em) {
7116 ret = false; 7155 ret = false;
@@ -7148,10 +7187,10 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7148 next_start = extent_map_end(em); 7187 next_start = extent_map_end(em);
7149 free_extent_map(em); 7188 free_extent_map(em);
7150 7189
7151 read_lock(&map_tree->map_tree.lock); 7190 read_lock(&map_tree->lock);
7152 em = lookup_extent_mapping(&map_tree->map_tree, next_start, 7191 em = lookup_extent_mapping(map_tree, next_start,
7153 (u64)(-1) - next_start); 7192 (u64)(-1) - next_start);
7154 read_unlock(&map_tree->map_tree.lock); 7193 read_unlock(&map_tree->lock);
7155 } 7194 }
7156out: 7195out:
7157 return ret; 7196 return ret;
@@ -7600,10 +7639,9 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
7600 */ 7639 */
7601int btrfs_bg_type_to_factor(u64 flags) 7640int btrfs_bg_type_to_factor(u64 flags)
7602{ 7641{
7603 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 7642 const int index = btrfs_bg_flags_to_raid_index(flags);
7604 BTRFS_BLOCK_GROUP_RAID10)) 7643
7605 return 2; 7644 return btrfs_raid_array[index].ncopies;
7606 return 1;
7607} 7645}
7608 7646
7609 7647
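
btrfs_chunk_max_errors() and btrfs_bg_type_to_factor() follow the same refactoring in this patch: translate the block group flags to a raid index once and read the attribute out of btrfs_raid_array instead of open-coding profile checks. A compact sketch of that flags-to-index-to-attribute lookup, with simplified placeholder bits and table contents rather than the real BTRFS_BLOCK_GROUP_* values:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the profile bits and the raid type enum. */
enum { BG_RAID1 = 1 << 0, BG_RAID5 = 1 << 1, BG_RAID6 = 1 << 2 };
enum demo_raid_types { R_RAID1, R_RAID5, R_RAID6, R_SINGLE, R_NR };

struct raid_attr { int tolerated_failures; int ncopies; };

/* Illustrative table in the spirit of btrfs_raid_array. */
static const struct raid_attr demo_raid[R_NR] = {
	[R_RAID1]  = { .tolerated_failures = 1, .ncopies = 2 },
	[R_RAID5]  = { .tolerated_failures = 1, .ncopies = 1 },
	[R_RAID6]  = { .tolerated_failures = 2, .ncopies = 1 },
	[R_SINGLE] = { .tolerated_failures = 0, .ncopies = 1 },
};

/* Analog of btrfs_bg_flags_to_raid_index(): profile bit -> table index. */
static enum demo_raid_types flags_to_index(uint64_t flags)
{
	if (flags & BG_RAID1) return R_RAID1;
	if (flags & BG_RAID5) return R_RAID5;
	if (flags & BG_RAID6) return R_RAID6;
	return R_SINGLE;
}

int main(void)
{
	uint64_t flags = BG_RAID6;
	const struct raid_attr *attr = &demo_raid[flags_to_index(flags)];

	printf("max tolerated failures: %d\n", attr->tolerated_failures); /* chunk_max_errors analog */
	printf("space factor (ncopies): %d\n", attr->ncopies);            /* bg_type_to_factor analog */
	return 0;
}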
@@ -7612,7 +7650,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7612 u64 chunk_offset, u64 devid, 7650 u64 chunk_offset, u64 devid,
7613 u64 physical_offset, u64 physical_len) 7651 u64 physical_offset, u64 physical_len)
7614{ 7652{
7615 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 7653 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7616 struct extent_map *em; 7654 struct extent_map *em;
7617 struct map_lookup *map; 7655 struct map_lookup *map;
7618 struct btrfs_device *dev; 7656 struct btrfs_device *dev;
@@ -7701,7 +7739,7 @@ out:
7701 7739
7702static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) 7740static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
7703{ 7741{
7704 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 7742 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7705 struct extent_map *em; 7743 struct extent_map *em;
7706 struct rb_node *node; 7744 struct rb_node *node;
7707 int ret = 0; 7745 int ret = 0;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 136a3eb64604..7f6aa1816409 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -23,6 +23,21 @@ struct btrfs_pending_bios {
23 struct bio *tail; 23 struct bio *tail;
24}; 24};
25 25
26struct btrfs_io_geometry {
27 /* remaining bytes before crossing a stripe */
28 u64 len;
29 /* offset of logical address in chunk */
30 u64 offset;
31 /* length of single IO stripe */
32 u64 stripe_len;
33 /* number of stripe where address falls */
34 u64 stripe_nr;
35 /* offset of address in stripe */
36 u64 stripe_offset;
37 /* offset of raid56 stripe into the chunk */
38 u64 raid56_stripe_offset;
39};
40
26/* 41/*
27 * Use sequence counter to get consistent device stat data on 42 * Use sequence counter to get consistent device stat data on
28 * 32-bit processors. 43 * 32-bit processors.
@@ -43,8 +58,8 @@ struct btrfs_pending_bios {
43#define BTRFS_DEV_STATE_FLUSH_SENT (4) 58#define BTRFS_DEV_STATE_FLUSH_SENT (4)
44 59
45struct btrfs_device { 60struct btrfs_device {
46 struct list_head dev_list; 61 struct list_head dev_list; /* device_list_mutex */
47 struct list_head dev_alloc_list; 62 struct list_head dev_alloc_list; /* chunk mutex */
48 struct list_head post_commit_list; /* chunk mutex */ 63 struct list_head post_commit_list; /* chunk mutex */
49 struct btrfs_fs_devices *fs_devices; 64 struct btrfs_fs_devices *fs_devices;
50 struct btrfs_fs_info *fs_info; 65 struct btrfs_fs_info *fs_info;
@@ -229,9 +244,14 @@ struct btrfs_fs_devices {
229 * this mutex lock. 244 * this mutex lock.
230 */ 245 */
231 struct mutex device_list_mutex; 246 struct mutex device_list_mutex;
247
248 /* List of all devices, protected by device_list_mutex */
232 struct list_head devices; 249 struct list_head devices;
233 250
234 /* devices not currently being allocated */ 251 /*
252 * Devices which can satisfy space allocation. Protected by
253 * chunk_mutex
254 */
235 struct list_head alloc_list; 255 struct list_head alloc_list;
236 256
237 struct btrfs_fs_devices *seed; 257 struct btrfs_fs_devices *seed;
@@ -336,16 +356,16 @@ struct btrfs_device_info {
336}; 356};
337 357
338struct btrfs_raid_attr { 358struct btrfs_raid_attr {
339 int sub_stripes; /* sub_stripes info for map */ 359 u8 sub_stripes; /* sub_stripes info for map */
340 int dev_stripes; /* stripes per dev */ 360 u8 dev_stripes; /* stripes per dev */
341 int devs_max; /* max devs to use */ 361 u8 devs_max; /* max devs to use */
342 int devs_min; /* min devs needed */ 362 u8 devs_min; /* min devs needed */
343 int tolerated_failures; /* max tolerated fail devs */ 363 u8 tolerated_failures; /* max tolerated fail devs */
344 int devs_increment; /* ndevs has to be a multiple of this */ 364 u8 devs_increment; /* ndevs has to be a multiple of this */
345 int ncopies; /* how many copies to data has */ 365 u8 ncopies; /* how many copies to data has */
346 int nparity; /* number of stripes worth of bytes to store 366 u8 nparity; /* number of stripes worth of bytes to store
347 * parity information */ 367 * parity information */
348 int mindev_error; /* error code if min devs requisite is unmet */ 368 u8 mindev_error; /* error code if min devs requisite is unmet */
349 const char raid_name[8]; /* name of the raid */ 369 const char raid_name[8]; /* name of the raid */
350 u64 bg_flag; /* block group flag of the raid */ 370 u64 bg_flag; /* block group flag of the raid */
351}; 371};
@@ -408,13 +428,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
408int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 428int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
409 u64 logical, u64 *length, 429 u64 logical, u64 *length,
410 struct btrfs_bio **bbio_ret); 430 struct btrfs_bio **bbio_ret);
431int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
432 u64 logical, u64 len, struct btrfs_io_geometry *io_geom);
411int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, 433int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
412 u64 physical, u64 **logical, int *naddrs, int *stripe_len); 434 u64 physical, u64 **logical, int *naddrs, int *stripe_len);
413int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); 435int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
414int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); 436int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
415int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); 437int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
416void btrfs_mapping_init(struct btrfs_mapping_tree *tree); 438void btrfs_mapping_tree_free(struct extent_map_tree *tree);
417void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
418blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 439blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
419 int mirror_num, int async_submit); 440 int mirror_num, int async_submit);
420int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 441int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
@@ -557,8 +578,6 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
557 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 578 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
558} 579}
559 580
560const char *get_raid_name(enum btrfs_raid_types type);
561
562void btrfs_commit_device_sizes(struct btrfs_transaction *trans); 581void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
563 582
564struct list_head *btrfs_get_fs_uuids(void); 583struct list_head *btrfs_get_fs_uuids(void);
@@ -568,6 +587,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
568 struct btrfs_device *failing_dev); 587 struct btrfs_device *failing_dev);
569 588
570int btrfs_bg_type_to_factor(u64 flags); 589int btrfs_bg_type_to_factor(u64 flags);
590const char *btrfs_bg_type_to_raid_name(u64 flags);
571int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); 591int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
572 592
573#endif 593#endif