Diffstat (limited to 'fs/btrfs')
50 files changed, 3786 insertions, 3124 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 23537bc8c827..212b4a854f2c 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -2,7 +2,8 @@ | |||
2 | 2 | ||
3 | config BTRFS_FS | 3 | config BTRFS_FS |
4 | tristate "Btrfs filesystem support" | 4 | tristate "Btrfs filesystem support" |
5 | select LIBCRC32C | 5 | select CRYPTO |
6 | select CRYPTO_CRC32C | ||
6 | select ZLIB_INFLATE | 7 | select ZLIB_INFLATE |
7 | select ZLIB_DEFLATE | 8 | select ZLIB_DEFLATE |
8 | select LZO_COMPRESS | 9 | select LZO_COMPRESS |
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ca693dd554e9..76a843198bcb 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -10,7 +10,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ | |||
10 | export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ | 10 | export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ |
11 | compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ | 11 | compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ |
12 | reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ | 12 | reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ |
13 | uuid-tree.o props.o free-space-tree.o tree-checker.o | 13 | uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ |
14 | block-rsv.o delalloc-space.o | ||
14 | 15 | ||
15 | btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o | 16 | btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o |
16 | btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o | 17 | btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o |
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 982152d3f920..89116afda7a2 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1465,12 +1465,11 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, | |||
1465 | * | 1465 | * |
1466 | * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. | 1466 | * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. |
1467 | */ | 1467 | */ |
1468 | int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) | 1468 | int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, |
1469 | struct ulist *roots, struct ulist *tmp) | ||
1469 | { | 1470 | { |
1470 | struct btrfs_fs_info *fs_info = root->fs_info; | 1471 | struct btrfs_fs_info *fs_info = root->fs_info; |
1471 | struct btrfs_trans_handle *trans; | 1472 | struct btrfs_trans_handle *trans; |
1472 | struct ulist *tmp = NULL; | ||
1473 | struct ulist *roots = NULL; | ||
1474 | struct ulist_iterator uiter; | 1473 | struct ulist_iterator uiter; |
1475 | struct ulist_node *node; | 1474 | struct ulist_node *node; |
1476 | struct seq_list elem = SEQ_LIST_INIT(elem); | 1475 | struct seq_list elem = SEQ_LIST_INIT(elem); |
@@ -1481,12 +1480,8 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) | |||
1481 | .share_count = 0, | 1480 | .share_count = 0, |
1482 | }; | 1481 | }; |
1483 | 1482 | ||
1484 | tmp = ulist_alloc(GFP_NOFS); | 1483 | ulist_init(roots); |
1485 | roots = ulist_alloc(GFP_NOFS); | 1484 | ulist_init(tmp); |
1486 | if (!tmp || !roots) { | ||
1487 | ret = -ENOMEM; | ||
1488 | goto out; | ||
1489 | } | ||
1490 | 1485 | ||
1491 | trans = btrfs_attach_transaction(root); | 1486 | trans = btrfs_attach_transaction(root); |
1492 | if (IS_ERR(trans)) { | 1487 | if (IS_ERR(trans)) { |
@@ -1527,8 +1522,8 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) | |||
1527 | up_read(&fs_info->commit_root_sem); | 1522 | up_read(&fs_info->commit_root_sem); |
1528 | } | 1523 | } |
1529 | out: | 1524 | out: |
1530 | ulist_free(tmp); | 1525 | ulist_release(roots); |
1531 | ulist_free(roots); | 1526 | ulist_release(tmp); |
1532 | return ret; | 1527 | return ret; |
1533 | } | 1528 | } |
1534 | 1529 | ||
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 54d58988483a..777f61dc081e 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -57,7 +57,8 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, | |||
57 | u64 start_off, struct btrfs_path *path, | 57 | u64 start_off, struct btrfs_path *path, |
58 | struct btrfs_inode_extref **ret_extref, | 58 | struct btrfs_inode_extref **ret_extref, |
59 | u64 *found_off); | 59 | u64 *found_off); |
60 | int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr); | 60 | int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, |
61 | struct ulist *roots, struct ulist *tmp_ulist); | ||
61 | 62 | ||
62 | int __init btrfs_prelim_ref_init(void); | 63 | int __init btrfs_prelim_ref_init(void); |
63 | void __cold btrfs_prelim_ref_exit(void); | 64 | void __cold btrfs_prelim_ref_exit(void); |
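The signature change above pushes ulist ownership to the caller: btrfs_check_shared() now re-initializes and drains two caller-supplied ulists with ulist_init()/ulist_release() instead of allocating and freeing them with ulist_alloc()/ulist_free() on every call. A minimal sketch of the intended caller pattern, hedged: the surrounding extent walk and the FIEMAP flag consumer are illustrative, only the ulist and btrfs_check_shared() calls reflect the new interface.

	struct ulist roots;
	struct ulist tmp;
	int ret;

	ulist_init(&roots);
	ulist_init(&tmp);

	/* e.g. once per file extent while building a fiemap reply */
	ret = btrfs_check_shared(root, btrfs_ino(inode), disk_bytenr,
				 &roots, &tmp);
	if (ret < 0)
		goto out;			/* error */
	if (ret)
		flags |= FIEMAP_EXTENT_SHARED;	/* illustrative consumer */
out:
	ulist_release(&roots);
	ulist_release(&tmp);

The point is that a caller iterating many extents pays for the ulist setup once instead of doing two allocations per extent.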
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
new file mode 100644
index 000000000000..698470b9f32d
--- /dev/null
+++ b/fs/btrfs/block-rsv.c
@@ -0,0 +1,425 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | #include "ctree.h" | ||
4 | #include "block-rsv.h" | ||
5 | #include "space-info.h" | ||
6 | #include "math.h" | ||
7 | #include "transaction.h" | ||
8 | |||
9 | static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, | ||
10 | struct btrfs_block_rsv *block_rsv, | ||
11 | struct btrfs_block_rsv *dest, u64 num_bytes, | ||
12 | u64 *qgroup_to_release_ret) | ||
13 | { | ||
14 | struct btrfs_space_info *space_info = block_rsv->space_info; | ||
15 | u64 qgroup_to_release = 0; | ||
16 | u64 ret; | ||
17 | |||
18 | spin_lock(&block_rsv->lock); | ||
19 | if (num_bytes == (u64)-1) { | ||
20 | num_bytes = block_rsv->size; | ||
21 | qgroup_to_release = block_rsv->qgroup_rsv_size; | ||
22 | } | ||
23 | block_rsv->size -= num_bytes; | ||
24 | if (block_rsv->reserved >= block_rsv->size) { | ||
25 | num_bytes = block_rsv->reserved - block_rsv->size; | ||
26 | block_rsv->reserved = block_rsv->size; | ||
27 | block_rsv->full = 1; | ||
28 | } else { | ||
29 | num_bytes = 0; | ||
30 | } | ||
31 | if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { | ||
32 | qgroup_to_release = block_rsv->qgroup_rsv_reserved - | ||
33 | block_rsv->qgroup_rsv_size; | ||
34 | block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; | ||
35 | } else { | ||
36 | qgroup_to_release = 0; | ||
37 | } | ||
38 | spin_unlock(&block_rsv->lock); | ||
39 | |||
40 | ret = num_bytes; | ||
41 | if (num_bytes > 0) { | ||
42 | if (dest) { | ||
43 | spin_lock(&dest->lock); | ||
44 | if (!dest->full) { | ||
45 | u64 bytes_to_add; | ||
46 | |||
47 | bytes_to_add = dest->size - dest->reserved; | ||
48 | bytes_to_add = min(num_bytes, bytes_to_add); | ||
49 | dest->reserved += bytes_to_add; | ||
50 | if (dest->reserved >= dest->size) | ||
51 | dest->full = 1; | ||
52 | num_bytes -= bytes_to_add; | ||
53 | } | ||
54 | spin_unlock(&dest->lock); | ||
55 | } | ||
56 | if (num_bytes) | ||
57 | btrfs_space_info_add_old_bytes(fs_info, space_info, | ||
58 | num_bytes); | ||
59 | } | ||
60 | if (qgroup_to_release_ret) | ||
61 | *qgroup_to_release_ret = qgroup_to_release; | ||
62 | return ret; | ||
63 | } | ||
64 | |||
65 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, | ||
66 | struct btrfs_block_rsv *dst, u64 num_bytes, | ||
67 | bool update_size) | ||
68 | { | ||
69 | int ret; | ||
70 | |||
71 | ret = btrfs_block_rsv_use_bytes(src, num_bytes); | ||
72 | if (ret) | ||
73 | return ret; | ||
74 | |||
75 | btrfs_block_rsv_add_bytes(dst, num_bytes, update_size); | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) | ||
80 | { | ||
81 | memset(rsv, 0, sizeof(*rsv)); | ||
82 | spin_lock_init(&rsv->lock); | ||
83 | rsv->type = type; | ||
84 | } | ||
85 | |||
86 | void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, | ||
87 | struct btrfs_block_rsv *rsv, | ||
88 | unsigned short type) | ||
89 | { | ||
90 | btrfs_init_block_rsv(rsv, type); | ||
91 | rsv->space_info = btrfs_find_space_info(fs_info, | ||
92 | BTRFS_BLOCK_GROUP_METADATA); | ||
93 | } | ||
94 | |||
95 | struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, | ||
96 | unsigned short type) | ||
97 | { | ||
98 | struct btrfs_block_rsv *block_rsv; | ||
99 | |||
100 | block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); | ||
101 | if (!block_rsv) | ||
102 | return NULL; | ||
103 | |||
104 | btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); | ||
105 | return block_rsv; | ||
106 | } | ||
107 | |||
108 | void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, | ||
109 | struct btrfs_block_rsv *rsv) | ||
110 | { | ||
111 | if (!rsv) | ||
112 | return; | ||
113 | btrfs_block_rsv_release(fs_info, rsv, (u64)-1); | ||
114 | kfree(rsv); | ||
115 | } | ||
116 | |||
117 | int btrfs_block_rsv_add(struct btrfs_root *root, | ||
118 | struct btrfs_block_rsv *block_rsv, u64 num_bytes, | ||
119 | enum btrfs_reserve_flush_enum flush) | ||
120 | { | ||
121 | int ret; | ||
122 | |||
123 | if (num_bytes == 0) | ||
124 | return 0; | ||
125 | |||
126 | ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); | ||
127 | if (!ret) | ||
128 | btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); | ||
129 | |||
130 | return ret; | ||
131 | } | ||
132 | |||
133 | int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) | ||
134 | { | ||
135 | u64 num_bytes = 0; | ||
136 | int ret = -ENOSPC; | ||
137 | |||
138 | if (!block_rsv) | ||
139 | return 0; | ||
140 | |||
141 | spin_lock(&block_rsv->lock); | ||
142 | num_bytes = div_factor(block_rsv->size, min_factor); | ||
143 | if (block_rsv->reserved >= num_bytes) | ||
144 | ret = 0; | ||
145 | spin_unlock(&block_rsv->lock); | ||
146 | |||
147 | return ret; | ||
148 | } | ||
149 | |||
150 | int btrfs_block_rsv_refill(struct btrfs_root *root, | ||
151 | struct btrfs_block_rsv *block_rsv, u64 min_reserved, | ||
152 | enum btrfs_reserve_flush_enum flush) | ||
153 | { | ||
154 | u64 num_bytes = 0; | ||
155 | int ret = -ENOSPC; | ||
156 | |||
157 | if (!block_rsv) | ||
158 | return 0; | ||
159 | |||
160 | spin_lock(&block_rsv->lock); | ||
161 | num_bytes = min_reserved; | ||
162 | if (block_rsv->reserved >= num_bytes) | ||
163 | ret = 0; | ||
164 | else | ||
165 | num_bytes -= block_rsv->reserved; | ||
166 | spin_unlock(&block_rsv->lock); | ||
167 | |||
168 | if (!ret) | ||
169 | return 0; | ||
170 | |||
171 | ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); | ||
172 | if (!ret) { | ||
173 | btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); | ||
174 | return 0; | ||
175 | } | ||
176 | |||
177 | return ret; | ||
178 | } | ||
179 | |||
180 | u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, | ||
181 | struct btrfs_block_rsv *block_rsv, | ||
182 | u64 num_bytes, u64 *qgroup_to_release) | ||
183 | { | ||
184 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
185 | struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; | ||
186 | struct btrfs_block_rsv *target = NULL; | ||
187 | |||
188 | /* | ||
189 | * If we are the delayed_rsv then push to the global rsv, otherwise dump | ||
190 | * into the delayed rsv if it is not full. | ||
191 | */ | ||
192 | if (block_rsv == delayed_rsv) | ||
193 | target = global_rsv; | ||
194 | else if (block_rsv != global_rsv && !delayed_rsv->full) | ||
195 | target = delayed_rsv; | ||
196 | |||
197 | if (target && block_rsv->space_info != target->space_info) | ||
198 | target = NULL; | ||
199 | |||
200 | return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, | ||
201 | qgroup_to_release); | ||
202 | } | ||
203 | |||
204 | int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes) | ||
205 | { | ||
206 | int ret = -ENOSPC; | ||
207 | |||
208 | spin_lock(&block_rsv->lock); | ||
209 | if (block_rsv->reserved >= num_bytes) { | ||
210 | block_rsv->reserved -= num_bytes; | ||
211 | if (block_rsv->reserved < block_rsv->size) | ||
212 | block_rsv->full = 0; | ||
213 | ret = 0; | ||
214 | } | ||
215 | spin_unlock(&block_rsv->lock); | ||
216 | return ret; | ||
217 | } | ||
218 | |||
219 | void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, | ||
220 | u64 num_bytes, bool update_size) | ||
221 | { | ||
222 | spin_lock(&block_rsv->lock); | ||
223 | block_rsv->reserved += num_bytes; | ||
224 | if (update_size) | ||
225 | block_rsv->size += num_bytes; | ||
226 | else if (block_rsv->reserved >= block_rsv->size) | ||
227 | block_rsv->full = 1; | ||
228 | spin_unlock(&block_rsv->lock); | ||
229 | } | ||
230 | |||
231 | int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, | ||
232 | struct btrfs_block_rsv *dest, u64 num_bytes, | ||
233 | int min_factor) | ||
234 | { | ||
235 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
236 | u64 min_bytes; | ||
237 | |||
238 | if (global_rsv->space_info != dest->space_info) | ||
239 | return -ENOSPC; | ||
240 | |||
241 | spin_lock(&global_rsv->lock); | ||
242 | min_bytes = div_factor(global_rsv->size, min_factor); | ||
243 | if (global_rsv->reserved < min_bytes + num_bytes) { | ||
244 | spin_unlock(&global_rsv->lock); | ||
245 | return -ENOSPC; | ||
246 | } | ||
247 | global_rsv->reserved -= num_bytes; | ||
248 | if (global_rsv->reserved < global_rsv->size) | ||
249 | global_rsv->full = 0; | ||
250 | spin_unlock(&global_rsv->lock); | ||
251 | |||
252 | btrfs_block_rsv_add_bytes(dest, num_bytes, true); | ||
253 | return 0; | ||
254 | } | ||
255 | |||
256 | void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) | ||
257 | { | ||
258 | struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; | ||
259 | struct btrfs_space_info *sinfo = block_rsv->space_info; | ||
260 | u64 num_bytes; | ||
261 | |||
262 | /* | ||
263 | * The global block rsv is based on the size of the extent tree, the | ||
264 | * checksum tree and the root tree. If the fs is empty we want to set | ||
265 | * it to a minimal amount for safety. | ||
266 | */ | ||
267 | num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + | ||
268 | btrfs_root_used(&fs_info->csum_root->root_item) + | ||
269 | btrfs_root_used(&fs_info->tree_root->root_item); | ||
270 | num_bytes = max_t(u64, num_bytes, SZ_16M); | ||
271 | |||
272 | spin_lock(&sinfo->lock); | ||
273 | spin_lock(&block_rsv->lock); | ||
274 | |||
275 | block_rsv->size = min_t(u64, num_bytes, SZ_512M); | ||
276 | |||
277 | if (block_rsv->reserved < block_rsv->size) { | ||
278 | num_bytes = btrfs_space_info_used(sinfo, true); | ||
279 | if (sinfo->total_bytes > num_bytes) { | ||
280 | num_bytes = sinfo->total_bytes - num_bytes; | ||
281 | num_bytes = min(num_bytes, | ||
282 | block_rsv->size - block_rsv->reserved); | ||
283 | block_rsv->reserved += num_bytes; | ||
284 | btrfs_space_info_update_bytes_may_use(fs_info, sinfo, | ||
285 | num_bytes); | ||
286 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
287 | sinfo->flags, num_bytes, | ||
288 | 1); | ||
289 | } | ||
290 | } else if (block_rsv->reserved > block_rsv->size) { | ||
291 | num_bytes = block_rsv->reserved - block_rsv->size; | ||
292 | btrfs_space_info_update_bytes_may_use(fs_info, sinfo, | ||
293 | -num_bytes); | ||
294 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
295 | sinfo->flags, num_bytes, 0); | ||
296 | block_rsv->reserved = block_rsv->size; | ||
297 | } | ||
298 | |||
299 | if (block_rsv->reserved == block_rsv->size) | ||
300 | block_rsv->full = 1; | ||
301 | else | ||
302 | block_rsv->full = 0; | ||
303 | |||
304 | spin_unlock(&block_rsv->lock); | ||
305 | spin_unlock(&sinfo->lock); | ||
306 | } | ||
307 | |||
308 | void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) | ||
309 | { | ||
310 | struct btrfs_space_info *space_info; | ||
311 | |||
312 | space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); | ||
313 | fs_info->chunk_block_rsv.space_info = space_info; | ||
314 | |||
315 | space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); | ||
316 | fs_info->global_block_rsv.space_info = space_info; | ||
317 | fs_info->trans_block_rsv.space_info = space_info; | ||
318 | fs_info->empty_block_rsv.space_info = space_info; | ||
319 | fs_info->delayed_block_rsv.space_info = space_info; | ||
320 | fs_info->delayed_refs_rsv.space_info = space_info; | ||
321 | |||
322 | fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; | ||
323 | fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; | ||
324 | fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; | ||
325 | fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; | ||
326 | if (fs_info->quota_root) | ||
327 | fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; | ||
328 | fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; | ||
329 | |||
330 | btrfs_update_global_block_rsv(fs_info); | ||
331 | } | ||
332 | |||
333 | void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info) | ||
334 | { | ||
335 | btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1); | ||
336 | WARN_ON(fs_info->trans_block_rsv.size > 0); | ||
337 | WARN_ON(fs_info->trans_block_rsv.reserved > 0); | ||
338 | WARN_ON(fs_info->chunk_block_rsv.size > 0); | ||
339 | WARN_ON(fs_info->chunk_block_rsv.reserved > 0); | ||
340 | WARN_ON(fs_info->delayed_block_rsv.size > 0); | ||
341 | WARN_ON(fs_info->delayed_block_rsv.reserved > 0); | ||
342 | WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); | ||
343 | WARN_ON(fs_info->delayed_refs_rsv.size > 0); | ||
344 | } | ||
345 | |||
346 | static struct btrfs_block_rsv *get_block_rsv( | ||
347 | const struct btrfs_trans_handle *trans, | ||
348 | const struct btrfs_root *root) | ||
349 | { | ||
350 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
351 | struct btrfs_block_rsv *block_rsv = NULL; | ||
352 | |||
353 | if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || | ||
354 | (root == fs_info->csum_root && trans->adding_csums) || | ||
355 | (root == fs_info->uuid_root)) | ||
356 | block_rsv = trans->block_rsv; | ||
357 | |||
358 | if (!block_rsv) | ||
359 | block_rsv = root->block_rsv; | ||
360 | |||
361 | if (!block_rsv) | ||
362 | block_rsv = &fs_info->empty_block_rsv; | ||
363 | |||
364 | return block_rsv; | ||
365 | } | ||
366 | |||
367 | struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, | ||
368 | struct btrfs_root *root, | ||
369 | u32 blocksize) | ||
370 | { | ||
371 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
372 | struct btrfs_block_rsv *block_rsv; | ||
373 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
374 | int ret; | ||
375 | bool global_updated = false; | ||
376 | |||
377 | block_rsv = get_block_rsv(trans, root); | ||
378 | |||
379 | if (unlikely(block_rsv->size == 0)) | ||
380 | goto try_reserve; | ||
381 | again: | ||
382 | ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize); | ||
383 | if (!ret) | ||
384 | return block_rsv; | ||
385 | |||
386 | if (block_rsv->failfast) | ||
387 | return ERR_PTR(ret); | ||
388 | |||
389 | if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { | ||
390 | global_updated = true; | ||
391 | btrfs_update_global_block_rsv(fs_info); | ||
392 | goto again; | ||
393 | } | ||
394 | |||
395 | /* | ||
396 | * The global reserve still exists to save us from ourselves, so don't | ||
397 | * warn_on if we are short on our delayed refs reserve. | ||
398 | */ | ||
399 | if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && | ||
400 | btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { | ||
401 | static DEFINE_RATELIMIT_STATE(_rs, | ||
402 | DEFAULT_RATELIMIT_INTERVAL * 10, | ||
403 | /*DEFAULT_RATELIMIT_BURST*/ 1); | ||
404 | if (__ratelimit(&_rs)) | ||
405 | WARN(1, KERN_DEBUG | ||
406 | "BTRFS: block rsv returned %d\n", ret); | ||
407 | } | ||
408 | try_reserve: | ||
409 | ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize, | ||
410 | BTRFS_RESERVE_NO_FLUSH); | ||
411 | if (!ret) | ||
412 | return block_rsv; | ||
413 | /* | ||
414 | * If we couldn't reserve metadata bytes try and use some from | ||
415 | * the global reserve if its space type is the same as the global | ||
416 | * reservation. | ||
417 | */ | ||
418 | if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && | ||
419 | block_rsv->space_info == global_rsv->space_info) { | ||
420 | ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize); | ||
421 | if (!ret) | ||
422 | return global_rsv; | ||
423 | } | ||
424 | return ERR_PTR(ret); | ||
425 | } | ||
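For orientation, the accounting these helpers consolidate works on three fields: size is how much the reserve wants to hold, reserved is how much it currently holds, and full is set once reserved reaches size; block_rsv_release_bytes() trims any excess, optionally refilling a target reserve (delayed refs or global) before handing leftovers back to the space_info. A hedged sketch of a typical caller of the exported API follows; fs_info, root and the byte count are illustrative and not taken from this patch.

	struct btrfs_block_rsv *rsv;
	int ret;

	/* a short-lived reserve for one metadata operation */
	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv)
		return -ENOMEM;

	/* reserve roughly one tree block, flushing space if necessary */
	ret = btrfs_block_rsv_add(root, rsv, fs_info->nodesize,
				  BTRFS_RESERVE_FLUSH_ALL);
	if (!ret)
		/* consume part of the reservation for an allocation */
		ret = btrfs_block_rsv_use_bytes(rsv, fs_info->nodesize);

	/* freeing releases whatever is still reserved back to space_info */
	btrfs_free_block_rsv(fs_info, rsv);
	return ret;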
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
new file mode 100644
index 000000000000..d1428bb73fc5
--- /dev/null
+++ b/fs/btrfs/block-rsv.h
@@ -0,0 +1,101 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | |||
3 | #ifndef BTRFS_BLOCK_RSV_H | ||
4 | #define BTRFS_BLOCK_RSV_H | ||
5 | |||
6 | struct btrfs_trans_handle; | ||
7 | enum btrfs_reserve_flush_enum; | ||
8 | |||
9 | /* | ||
10 | * Types of block reserves | ||
11 | */ | ||
12 | enum { | ||
13 | BTRFS_BLOCK_RSV_GLOBAL, | ||
14 | BTRFS_BLOCK_RSV_DELALLOC, | ||
15 | BTRFS_BLOCK_RSV_TRANS, | ||
16 | BTRFS_BLOCK_RSV_CHUNK, | ||
17 | BTRFS_BLOCK_RSV_DELOPS, | ||
18 | BTRFS_BLOCK_RSV_DELREFS, | ||
19 | BTRFS_BLOCK_RSV_EMPTY, | ||
20 | BTRFS_BLOCK_RSV_TEMP, | ||
21 | }; | ||
22 | |||
23 | struct btrfs_block_rsv { | ||
24 | u64 size; | ||
25 | u64 reserved; | ||
26 | struct btrfs_space_info *space_info; | ||
27 | spinlock_t lock; | ||
28 | unsigned short full; | ||
29 | unsigned short type; | ||
30 | unsigned short failfast; | ||
31 | |||
32 | /* | ||
33 | * Qgroup equivalent for @size @reserved | ||
34 | * | ||
35 | * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care | ||
36 | * about things like csum size nor how many tree blocks it will need to | ||
37 | * reserve. | ||
38 | * | ||
39 | * Qgroup cares more about net change of the extent usage. | ||
40 | * | ||
41 | * So for one newly inserted file extent, in worst case it will cause | ||
42 | * leaf split and level increase, nodesize for each file extent is | ||
43 | * already too much. | ||
44 | * | ||
45 | * In short, qgroup_size/reserved is the upper limit of possible needed | ||
46 | * qgroup metadata reservation. | ||
47 | */ | ||
48 | u64 qgroup_rsv_size; | ||
49 | u64 qgroup_rsv_reserved; | ||
50 | }; | ||
51 | |||
52 | void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); | ||
53 | struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, | ||
54 | unsigned short type); | ||
55 | void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, | ||
56 | struct btrfs_block_rsv *rsv, | ||
57 | unsigned short type); | ||
58 | void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, | ||
59 | struct btrfs_block_rsv *rsv); | ||
60 | int btrfs_block_rsv_add(struct btrfs_root *root, | ||
61 | struct btrfs_block_rsv *block_rsv, u64 num_bytes, | ||
62 | enum btrfs_reserve_flush_enum flush); | ||
63 | int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); | ||
64 | int btrfs_block_rsv_refill(struct btrfs_root *root, | ||
65 | struct btrfs_block_rsv *block_rsv, u64 min_reserved, | ||
66 | enum btrfs_reserve_flush_enum flush); | ||
67 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, | ||
68 | struct btrfs_block_rsv *dst_rsv, u64 num_bytes, | ||
69 | bool update_size); | ||
70 | int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); | ||
71 | int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, | ||
72 | struct btrfs_block_rsv *dest, u64 num_bytes, | ||
73 | int min_factor); | ||
74 | void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, | ||
75 | u64 num_bytes, bool update_size); | ||
76 | u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, | ||
77 | struct btrfs_block_rsv *block_rsv, | ||
78 | u64 num_bytes, u64 *qgroup_to_release); | ||
79 | void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info); | ||
80 | void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info); | ||
81 | void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); | ||
82 | struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, | ||
83 | struct btrfs_root *root, | ||
84 | u32 blocksize); | ||
85 | |||
86 | static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, | ||
87 | struct btrfs_block_rsv *block_rsv, | ||
88 | u64 num_bytes) | ||
89 | { | ||
90 | __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); | ||
91 | } | ||
92 | |||
93 | static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, | ||
94 | struct btrfs_block_rsv *block_rsv, | ||
95 | u32 blocksize) | ||
96 | { | ||
97 | btrfs_block_rsv_add_bytes(block_rsv, blocksize, false); | ||
98 | btrfs_block_rsv_release(fs_info, block_rsv, 0); | ||
99 | } | ||
100 | |||
101 | #endif /* BTRFS_BLOCK_RSV_H */ | ||
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d5b438706b77..f853835c409c 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -337,22 +337,34 @@ static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) | |||
337 | clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); | 337 | clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); |
338 | } | 338 | } |
339 | 339 | ||
340 | /* Array of bytes with variable length, hexadecimal format 0x1234 */ | ||
341 | #define CSUM_FMT "0x%*phN" | ||
342 | #define CSUM_FMT_VALUE(size, bytes) size, bytes | ||
343 | |||
340 | static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, | 344 | static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, |
341 | u64 logical_start, u32 csum, u32 csum_expected, int mirror_num) | 345 | u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) |
342 | { | 346 | { |
343 | struct btrfs_root *root = inode->root; | 347 | struct btrfs_root *root = inode->root; |
348 | struct btrfs_super_block *sb = root->fs_info->super_copy; | ||
349 | const u16 csum_size = btrfs_super_csum_size(sb); | ||
344 | 350 | ||
345 | /* Output minus objectid, which is more meaningful */ | 351 | /* Output minus objectid, which is more meaningful */ |
346 | if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) | 352 | if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) |
347 | btrfs_warn_rl(root->fs_info, | 353 | btrfs_warn_rl(root->fs_info, |
348 | "csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d", | 354 | "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", |
349 | root->root_key.objectid, btrfs_ino(inode), | 355 | root->root_key.objectid, btrfs_ino(inode), |
350 | logical_start, csum, csum_expected, mirror_num); | 356 | logical_start, |
357 | CSUM_FMT_VALUE(csum_size, csum), | ||
358 | CSUM_FMT_VALUE(csum_size, csum_expected), | ||
359 | mirror_num); | ||
351 | else | 360 | else |
352 | btrfs_warn_rl(root->fs_info, | 361 | btrfs_warn_rl(root->fs_info, |
353 | "csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d", | 362 | "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", |
354 | root->root_key.objectid, btrfs_ino(inode), | 363 | root->root_key.objectid, btrfs_ino(inode), |
355 | logical_start, csum, csum_expected, mirror_num); | 364 | logical_start, |
365 | CSUM_FMT_VALUE(csum_size, csum), | ||
366 | CSUM_FMT_VALUE(csum_size, csum_expected), | ||
367 | mirror_num); | ||
356 | } | 368 | } |
357 | 369 | ||
358 | #endif | 370 | #endif |
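CSUM_FMT builds on the kernel's %*phN printk extension, which prints a byte buffer as one contiguous hex string and consumes a (length, pointer) argument pair, so the same format string handles any checksum size reported by btrfs_super_csum_size(). A small hedged illustration; the values are made up:

	u8 csum[BTRFS_CSUM_SIZE] = { 0x12, 0x34, 0x56, 0x78 };
	const u16 csum_size = 4;	/* crc32c */

	/* prints "csum 0x12345678"; %*phN consumes (csum_size, csum) */
	pr_info("csum " CSUM_FMT "\n", CSUM_FMT_VALUE(csum_size, csum));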
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index b0c8094528d1..81a9731959a9 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -83,7 +83,7 @@ | |||
83 | #include <linux/blkdev.h> | 83 | #include <linux/blkdev.h> |
84 | #include <linux/mm.h> | 84 | #include <linux/mm.h> |
85 | #include <linux/string.h> | 85 | #include <linux/string.h> |
86 | #include <linux/crc32c.h> | 86 | #include <crypto/hash.h> |
87 | #include "ctree.h" | 87 | #include "ctree.h" |
88 | #include "disk-io.h" | 88 | #include "disk-io.h" |
89 | #include "transaction.h" | 89 | #include "transaction.h" |
@@ -1710,9 +1710,9 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, | |||
1710 | char **datav, unsigned int num_pages) | 1710 | char **datav, unsigned int num_pages) |
1711 | { | 1711 | { |
1712 | struct btrfs_fs_info *fs_info = state->fs_info; | 1712 | struct btrfs_fs_info *fs_info = state->fs_info; |
1713 | SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); | ||
1713 | struct btrfs_header *h; | 1714 | struct btrfs_header *h; |
1714 | u8 csum[BTRFS_CSUM_SIZE]; | 1715 | u8 csum[BTRFS_CSUM_SIZE]; |
1715 | u32 crc = ~(u32)0; | ||
1716 | unsigned int i; | 1716 | unsigned int i; |
1717 | 1717 | ||
1718 | if (num_pages * PAGE_SIZE < state->metablock_size) | 1718 | if (num_pages * PAGE_SIZE < state->metablock_size) |
@@ -1723,14 +1723,17 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, | |||
1723 | if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) | 1723 | if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) |
1724 | return 1; | 1724 | return 1; |
1725 | 1725 | ||
1726 | shash->tfm = fs_info->csum_shash; | ||
1727 | crypto_shash_init(shash); | ||
1728 | |||
1726 | for (i = 0; i < num_pages; i++) { | 1729 | for (i = 0; i < num_pages; i++) { |
1727 | u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); | 1730 | u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); |
1728 | size_t sublen = i ? PAGE_SIZE : | 1731 | size_t sublen = i ? PAGE_SIZE : |
1729 | (PAGE_SIZE - BTRFS_CSUM_SIZE); | 1732 | (PAGE_SIZE - BTRFS_CSUM_SIZE); |
1730 | 1733 | ||
1731 | crc = crc32c(crc, data, sublen); | 1734 | crypto_shash_update(shash, data, sublen); |
1732 | } | 1735 | } |
1733 | btrfs_csum_final(crc, csum); | 1736 | crypto_shash_final(shash, csum); |
1734 | if (memcmp(csum, h->csum, state->csum_size)) | 1737 | if (memcmp(csum, h->csum, state->csum_size)) |
1735 | return 1; | 1738 | return 1; |
1736 | 1739 | ||
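The conversion above follows the standard synchronous-hash flow of the kernel crypto API: a shash transform (here fs_info->csum_shash, set up at mount) plus a per-use descriptor from SHASH_DESC_ON_STACK(), driven by crypto_shash_init()/update()/final(). A hedged, self-contained sketch of that flow with a locally allocated transform instead of the cached one:

#include <crypto/hash.h>

/* sketch: one-shot crc32c of a buffer via the crypto shash API */
static int example_csum(const u8 *data, size_t len, u8 *out)
{
	struct crypto_shash *tfm = crypto_alloc_shash("crc32c", 0, 0);
	int ret;

	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(shash, tfm);

		shash->tfm = tfm;
		ret = crypto_shash_init(shash);
		if (!ret)
			ret = crypto_shash_update(shash, data, len);
		if (!ret)
			ret = crypto_shash_final(shash, out);
	}

	crypto_free_shash(tfm);
	return ret;
}

btrfs avoids the per-call allocation by keeping the transform in fs_info->csum_shash, which is why the patched code only needs the on-stack descriptor.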
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 84dd4a8980c5..60c47b417a4b 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
18 | #include <linux/sched/mm.h> | 18 | #include <linux/sched/mm.h> |
19 | #include <linux/log2.h> | 19 | #include <linux/log2.h> |
20 | #include <crypto/hash.h> | ||
20 | #include "ctree.h" | 21 | #include "ctree.h" |
21 | #include "disk-io.h" | 22 | #include "disk-io.h" |
22 | #include "transaction.h" | 23 | #include "transaction.h" |
@@ -42,6 +43,22 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type) | |||
42 | return NULL; | 43 | return NULL; |
43 | } | 44 | } |
44 | 45 | ||
46 | bool btrfs_compress_is_valid_type(const char *str, size_t len) | ||
47 | { | ||
48 | int i; | ||
49 | |||
50 | for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) { | ||
51 | size_t comp_len = strlen(btrfs_compress_types[i]); | ||
52 | |||
53 | if (len < comp_len) | ||
54 | continue; | ||
55 | |||
56 | if (!strncmp(btrfs_compress_types[i], str, comp_len)) | ||
57 | return true; | ||
58 | } | ||
59 | return false; | ||
60 | } | ||
61 | |||
45 | static int btrfs_decompress_bio(struct compressed_bio *cb); | 62 | static int btrfs_decompress_bio(struct compressed_bio *cb); |
46 | 63 | ||
47 | static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, | 64 | static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, |
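btrfs_compress_is_valid_type() does a prefix match of the supplied string against the known algorithm names, so callers can validate a user-supplied value (for example a compression property) before acting on it. A hedged usage sketch, where value is a hypothetical user-supplied string:

	/* accepts "zlib", "lzo", "zstd" and "zstd:3"-style prefixes */
	if (!btrfs_compress_is_valid_type(value, strlen(value)))
		return -EINVAL;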
@@ -57,32 +74,37 @@ static int check_compressed_csum(struct btrfs_inode *inode, | |||
57 | struct compressed_bio *cb, | 74 | struct compressed_bio *cb, |
58 | u64 disk_start) | 75 | u64 disk_start) |
59 | { | 76 | { |
77 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||
78 | SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); | ||
79 | const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); | ||
60 | int ret; | 80 | int ret; |
61 | struct page *page; | 81 | struct page *page; |
62 | unsigned long i; | 82 | unsigned long i; |
63 | char *kaddr; | 83 | char *kaddr; |
64 | u32 csum; | 84 | u8 csum[BTRFS_CSUM_SIZE]; |
65 | u32 *cb_sum = &cb->sums; | 85 | u8 *cb_sum = cb->sums; |
66 | 86 | ||
67 | if (inode->flags & BTRFS_INODE_NODATASUM) | 87 | if (inode->flags & BTRFS_INODE_NODATASUM) |
68 | return 0; | 88 | return 0; |
69 | 89 | ||
90 | shash->tfm = fs_info->csum_shash; | ||
91 | |||
70 | for (i = 0; i < cb->nr_pages; i++) { | 92 | for (i = 0; i < cb->nr_pages; i++) { |
71 | page = cb->compressed_pages[i]; | 93 | page = cb->compressed_pages[i]; |
72 | csum = ~(u32)0; | ||
73 | 94 | ||
95 | crypto_shash_init(shash); | ||
74 | kaddr = kmap_atomic(page); | 96 | kaddr = kmap_atomic(page); |
75 | csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE); | 97 | crypto_shash_update(shash, kaddr, PAGE_SIZE); |
76 | btrfs_csum_final(csum, (u8 *)&csum); | ||
77 | kunmap_atomic(kaddr); | 98 | kunmap_atomic(kaddr); |
99 | crypto_shash_final(shash, (u8 *)&csum); | ||
78 | 100 | ||
79 | if (csum != *cb_sum) { | 101 | if (memcmp(&csum, cb_sum, csum_size)) { |
80 | btrfs_print_data_csum_error(inode, disk_start, csum, | 102 | btrfs_print_data_csum_error(inode, disk_start, |
81 | *cb_sum, cb->mirror_num); | 103 | csum, cb_sum, cb->mirror_num); |
82 | ret = -EIO; | 104 | ret = -EIO; |
83 | goto fail; | 105 | goto fail; |
84 | } | 106 | } |
85 | cb_sum++; | 107 | cb_sum += csum_size; |
86 | 108 | ||
87 | } | 109 | } |
88 | ret = 0; | 110 | ret = 0; |
@@ -318,7 +340,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, | |||
318 | 340 | ||
319 | bdev = fs_info->fs_devices->latest_bdev; | 341 | bdev = fs_info->fs_devices->latest_bdev; |
320 | 342 | ||
321 | bio = btrfs_bio_alloc(bdev, first_byte); | 343 | bio = btrfs_bio_alloc(first_byte); |
344 | bio_set_dev(bio, bdev); | ||
322 | bio->bi_opf = REQ_OP_WRITE | write_flags; | 345 | bio->bi_opf = REQ_OP_WRITE | write_flags; |
323 | bio->bi_private = cb; | 346 | bio->bi_private = cb; |
324 | bio->bi_end_io = end_compressed_bio_write; | 347 | bio->bi_end_io = end_compressed_bio_write; |
@@ -360,7 +383,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, | |||
360 | bio_endio(bio); | 383 | bio_endio(bio); |
361 | } | 384 | } |
362 | 385 | ||
363 | bio = btrfs_bio_alloc(bdev, first_byte); | 386 | bio = btrfs_bio_alloc(first_byte); |
387 | bio_set_dev(bio, bdev); | ||
364 | bio->bi_opf = REQ_OP_WRITE | write_flags; | 388 | bio->bi_opf = REQ_OP_WRITE | write_flags; |
365 | bio->bi_private = cb; | 389 | bio->bi_private = cb; |
366 | bio->bi_end_io = end_compressed_bio_write; | 390 | bio->bi_end_io = end_compressed_bio_write; |
@@ -536,7 +560,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | |||
536 | struct extent_map *em; | 560 | struct extent_map *em; |
537 | blk_status_t ret = BLK_STS_RESOURCE; | 561 | blk_status_t ret = BLK_STS_RESOURCE; |
538 | int faili = 0; | 562 | int faili = 0; |
539 | u32 *sums; | 563 | const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); |
564 | u8 *sums; | ||
540 | 565 | ||
541 | em_tree = &BTRFS_I(inode)->extent_tree; | 566 | em_tree = &BTRFS_I(inode)->extent_tree; |
542 | 567 | ||
@@ -558,7 +583,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | |||
558 | cb->errors = 0; | 583 | cb->errors = 0; |
559 | cb->inode = inode; | 584 | cb->inode = inode; |
560 | cb->mirror_num = mirror_num; | 585 | cb->mirror_num = mirror_num; |
561 | sums = &cb->sums; | 586 | sums = cb->sums; |
562 | 587 | ||
563 | cb->start = em->orig_start; | 588 | cb->start = em->orig_start; |
564 | em_len = em->len; | 589 | em_len = em->len; |
@@ -597,7 +622,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | |||
597 | /* include any pages we added in add_ra-bio_pages */ | 622 | /* include any pages we added in add_ra-bio_pages */ |
598 | cb->len = bio->bi_iter.bi_size; | 623 | cb->len = bio->bi_iter.bi_size; |
599 | 624 | ||
600 | comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); | 625 | comp_bio = btrfs_bio_alloc(cur_disk_byte); |
626 | bio_set_dev(comp_bio, bdev); | ||
601 | comp_bio->bi_opf = REQ_OP_READ; | 627 | comp_bio->bi_opf = REQ_OP_READ; |
602 | comp_bio->bi_private = cb; | 628 | comp_bio->bi_private = cb; |
603 | comp_bio->bi_end_io = end_compressed_bio_read; | 629 | comp_bio->bi_end_io = end_compressed_bio_read; |
@@ -617,6 +643,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | |||
617 | page->mapping = NULL; | 643 | page->mapping = NULL; |
618 | if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < | 644 | if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < |
619 | PAGE_SIZE) { | 645 | PAGE_SIZE) { |
646 | unsigned int nr_sectors; | ||
647 | |||
620 | ret = btrfs_bio_wq_end_io(fs_info, comp_bio, | 648 | ret = btrfs_bio_wq_end_io(fs_info, comp_bio, |
621 | BTRFS_WQ_ENDIO_DATA); | 649 | BTRFS_WQ_ENDIO_DATA); |
622 | BUG_ON(ret); /* -ENOMEM */ | 650 | BUG_ON(ret); /* -ENOMEM */ |
@@ -634,8 +662,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | |||
634 | sums); | 662 | sums); |
635 | BUG_ON(ret); /* -ENOMEM */ | 663 | BUG_ON(ret); /* -ENOMEM */ |
636 | } | 664 | } |
637 | sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size, | 665 | |
638 | fs_info->sectorsize); | 666 | nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, |
667 | fs_info->sectorsize); | ||
668 | sums += csum_size * nr_sectors; | ||
639 | 669 | ||
640 | ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); | 670 | ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); |
641 | if (ret) { | 671 | if (ret) { |
@@ -643,7 +673,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | |||
643 | bio_endio(comp_bio); | 673 | bio_endio(comp_bio); |
644 | } | 674 | } |
645 | 675 | ||
646 | comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); | 676 | comp_bio = btrfs_bio_alloc(cur_disk_byte); |
677 | bio_set_dev(comp_bio, bdev); | ||
647 | comp_bio->bi_opf = REQ_OP_READ; | 678 | comp_bio->bi_opf = REQ_OP_READ; |
648 | comp_bio->bi_private = cb; | 679 | comp_bio->bi_private = cb; |
649 | comp_bio->bi_end_io = end_compressed_bio_read; | 680 | comp_bio->bi_end_io = end_compressed_bio_read; |
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 9976fe0f7526..2035b8eb1290 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -61,7 +61,7 @@ struct compressed_bio { | |||
61 | * the start of a variable length array of checksums only | 61 | * the start of a variable length array of checksums only |
62 | * used by reads | 62 | * used by reads |
63 | */ | 63 | */ |
64 | u32 sums; | 64 | u8 sums[]; |
65 | }; | 65 | }; |
66 | 66 | ||
67 | static inline unsigned int btrfs_compress_type(unsigned int type_level) | 67 | static inline unsigned int btrfs_compress_type(unsigned int type_level) |
@@ -173,6 +173,7 @@ extern const struct btrfs_compress_op btrfs_lzo_compress; | |||
173 | extern const struct btrfs_compress_op btrfs_zstd_compress; | 173 | extern const struct btrfs_compress_op btrfs_zstd_compress; |
174 | 174 | ||
175 | const char* btrfs_compress_type2str(enum btrfs_compression_type type); | 175 | const char* btrfs_compress_type2str(enum btrfs_compression_type type); |
176 | bool btrfs_compress_is_valid_type(const char *str, size_t len); | ||
176 | 177 | ||
177 | int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); | 178 | int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); |
178 | 179 | ||
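Turning sums into a flexible array member means the per-sector checksum area now lives at the tail of the compressed_bio allocation, sized from the number of sectors the bio covers. A hedged sketch of how such a structure is commonly allocated; the local names and the struct_size()-based computation are illustrative rather than the exact btrfs call site:

	/* sketch: allocate a compressed_bio with room for per-sector csums */
	const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
	unsigned int nr_sectors = DIV_ROUND_UP(compressed_len,
					       fs_info->sectorsize);
	struct compressed_bio *cb;

	cb = kzalloc(struct_size(cb, sums, (size_t)csum_size * nr_sectors),
		     GFP_NOFS);
	if (!cb)
		return BLK_STS_RESOURCE;

The read path in compression.c then walks this area with a u8 cursor, advancing by csum_size per sector instead of by one u32 as before.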
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0a61dff27f57..299e11e6c554 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/kobject.h> | 19 | #include <linux/kobject.h> |
20 | #include <trace/events/btrfs.h> | 20 | #include <trace/events/btrfs.h> |
21 | #include <asm/kmap_types.h> | 21 | #include <asm/kmap_types.h> |
22 | #include <asm/unaligned.h> | ||
22 | #include <linux/pagemap.h> | 23 | #include <linux/pagemap.h> |
23 | #include <linux/btrfs.h> | 24 | #include <linux/btrfs.h> |
24 | #include <linux/btrfs_tree.h> | 25 | #include <linux/btrfs_tree.h> |
@@ -31,11 +32,13 @@ | |||
31 | #include "extent_io.h" | 32 | #include "extent_io.h" |
32 | #include "extent_map.h" | 33 | #include "extent_map.h" |
33 | #include "async-thread.h" | 34 | #include "async-thread.h" |
35 | #include "block-rsv.h" | ||
34 | 36 | ||
35 | struct btrfs_trans_handle; | 37 | struct btrfs_trans_handle; |
36 | struct btrfs_transaction; | 38 | struct btrfs_transaction; |
37 | struct btrfs_pending_snapshot; | 39 | struct btrfs_pending_snapshot; |
38 | struct btrfs_delayed_ref_root; | 40 | struct btrfs_delayed_ref_root; |
41 | struct btrfs_space_info; | ||
39 | extern struct kmem_cache *btrfs_trans_handle_cachep; | 42 | extern struct kmem_cache *btrfs_trans_handle_cachep; |
40 | extern struct kmem_cache *btrfs_bit_radix_cachep; | 43 | extern struct kmem_cache *btrfs_bit_radix_cachep; |
41 | extern struct kmem_cache *btrfs_path_cachep; | 44 | extern struct kmem_cache *btrfs_path_cachep; |
@@ -45,7 +48,16 @@ struct btrfs_ref; | |||
45 | 48 | ||
46 | #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ | 49 | #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ |
47 | 50 | ||
48 | #define BTRFS_MAX_MIRRORS 3 | 51 | /* |
52 | * Maximum number of mirrors that can be available for all profiles counting | ||
53 | * the target device of dev-replace as one. During an active device replace | ||
54 | * procedure, the target device of the copy operation is a mirror for the | ||
55 | * filesystem data as well that can be used to read data in order to repair | ||
56 | * read errors on other disks. | ||
57 | * | ||
58 | * Current value is derived from RAID1 with 2 copies. | ||
59 | */ | ||
60 | #define BTRFS_MAX_MIRRORS (2 + 1) | ||
49 | 61 | ||
50 | #define BTRFS_MAX_LEVEL 8 | 62 | #define BTRFS_MAX_LEVEL 8 |
51 | 63 | ||
@@ -72,6 +84,7 @@ struct btrfs_ref; | |||
72 | 84 | ||
73 | /* four bytes for CRC32 */ | 85 | /* four bytes for CRC32 */ |
74 | static const int btrfs_csum_sizes[] = { 4 }; | 86 | static const int btrfs_csum_sizes[] = { 4 }; |
87 | static const char *btrfs_csum_names[] = { "crc32c" }; | ||
75 | 88 | ||
76 | #define BTRFS_EMPTY_DIR_SIZE 0 | 89 | #define BTRFS_EMPTY_DIR_SIZE 0 |
77 | 90 | ||
@@ -99,10 +112,6 @@ static inline u32 count_max_extents(u64 size) | |||
99 | return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); | 112 | return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); |
100 | } | 113 | } |
101 | 114 | ||
102 | struct btrfs_mapping_tree { | ||
103 | struct extent_map_tree map_tree; | ||
104 | }; | ||
105 | |||
106 | static inline unsigned long btrfs_chunk_item_size(int num_stripes) | 115 | static inline unsigned long btrfs_chunk_item_size(int num_stripes) |
107 | { | 116 | { |
108 | BUG_ON(num_stripes == 0); | 117 | BUG_ON(num_stripes == 0); |
@@ -395,115 +404,6 @@ struct raid_kobject { | |||
395 | struct list_head list; | 404 | struct list_head list; |
396 | }; | 405 | }; |
397 | 406 | ||
398 | struct btrfs_space_info { | ||
399 | spinlock_t lock; | ||
400 | |||
401 | u64 total_bytes; /* total bytes in the space, | ||
402 | this doesn't take mirrors into account */ | ||
403 | u64 bytes_used; /* total bytes used, | ||
404 | this doesn't take mirrors into account */ | ||
405 | u64 bytes_pinned; /* total bytes pinned, will be freed when the | ||
406 | transaction finishes */ | ||
407 | u64 bytes_reserved; /* total bytes the allocator has reserved for | ||
408 | current allocations */ | ||
409 | u64 bytes_may_use; /* number of bytes that may be used for | ||
410 | delalloc/allocations */ | ||
411 | u64 bytes_readonly; /* total bytes that are read only */ | ||
412 | |||
413 | u64 max_extent_size; /* This will hold the maximum extent size of | ||
414 | the space info if we had an ENOSPC in the | ||
415 | allocator. */ | ||
416 | |||
417 | unsigned int full:1; /* indicates that we cannot allocate any more | ||
418 | chunks for this space */ | ||
419 | unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ | ||
420 | |||
421 | unsigned int flush:1; /* set if we are trying to make space */ | ||
422 | |||
423 | unsigned int force_alloc; /* set if we need to force a chunk | ||
424 | alloc for this space */ | ||
425 | |||
426 | u64 disk_used; /* total bytes used on disk */ | ||
427 | u64 disk_total; /* total bytes on disk, takes mirrors into | ||
428 | account */ | ||
429 | |||
430 | u64 flags; | ||
431 | |||
432 | /* | ||
433 | * bytes_pinned is kept in line with what is actually pinned, as in | ||
434 | * we've called update_block_group and dropped the bytes_used counter | ||
435 | * and increased the bytes_pinned counter. However this means that | ||
436 | * bytes_pinned does not reflect the bytes that will be pinned once the | ||
437 | * delayed refs are flushed, so this counter is inc'ed every time we | ||
438 | * call btrfs_free_extent so it is a realtime count of what will be | ||
439 | * freed once the transaction is committed. It will be zeroed every | ||
440 | * time the transaction commits. | ||
441 | */ | ||
442 | struct percpu_counter total_bytes_pinned; | ||
443 | |||
444 | struct list_head list; | ||
445 | /* Protected by the spinlock 'lock'. */ | ||
446 | struct list_head ro_bgs; | ||
447 | struct list_head priority_tickets; | ||
448 | struct list_head tickets; | ||
449 | /* | ||
450 | * tickets_id just indicates the next ticket will be handled, so note | ||
451 | * it's not stored per ticket. | ||
452 | */ | ||
453 | u64 tickets_id; | ||
454 | |||
455 | struct rw_semaphore groups_sem; | ||
456 | /* for block groups in our same type */ | ||
457 | struct list_head block_groups[BTRFS_NR_RAID_TYPES]; | ||
458 | wait_queue_head_t wait; | ||
459 | |||
460 | struct kobject kobj; | ||
461 | struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; | ||
462 | }; | ||
463 | |||
464 | /* | ||
465 | * Types of block reserves | ||
466 | */ | ||
467 | enum { | ||
468 | BTRFS_BLOCK_RSV_GLOBAL, | ||
469 | BTRFS_BLOCK_RSV_DELALLOC, | ||
470 | BTRFS_BLOCK_RSV_TRANS, | ||
471 | BTRFS_BLOCK_RSV_CHUNK, | ||
472 | BTRFS_BLOCK_RSV_DELOPS, | ||
473 | BTRFS_BLOCK_RSV_DELREFS, | ||
474 | BTRFS_BLOCK_RSV_EMPTY, | ||
475 | BTRFS_BLOCK_RSV_TEMP, | ||
476 | }; | ||
477 | |||
478 | struct btrfs_block_rsv { | ||
479 | u64 size; | ||
480 | u64 reserved; | ||
481 | struct btrfs_space_info *space_info; | ||
482 | spinlock_t lock; | ||
483 | unsigned short full; | ||
484 | unsigned short type; | ||
485 | unsigned short failfast; | ||
486 | |||
487 | /* | ||
488 | * Qgroup equivalent for @size @reserved | ||
489 | * | ||
490 | * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care | ||
491 | * about things like csum size nor how many tree blocks it will need to | ||
492 | * reserve. | ||
493 | * | ||
494 | * Qgroup cares more about net change of the extent usage. | ||
495 | * | ||
496 | * So for one newly inserted file extent, in worst case it will cause | ||
497 | * leaf split and level increase, nodesize for each file extent is | ||
498 | * already too much. | ||
499 | * | ||
500 | * In short, qgroup_size/reserved is the upper limit of possible needed | ||
501 | * qgroup metadata reservation. | ||
502 | */ | ||
503 | u64 qgroup_rsv_size; | ||
504 | u64 qgroup_rsv_reserved; | ||
505 | }; | ||
506 | |||
507 | /* | 407 | /* |
508 | * free clusters are used to claim free space in relatively large chunks, | 408 | * free clusters are used to claim free space in relatively large chunks, |
509 | * allowing us to do less seeky writes. They are used for all metadata | 409 | * allowing us to do less seeky writes. They are used for all metadata |
@@ -786,11 +686,18 @@ enum { | |||
786 | /* | 686 | /* |
787 | * Indicate that balance has been set up from the ioctl and is in the | 687 | * Indicate that balance has been set up from the ioctl and is in the |
788 | * main phase. The fs_info::balance_ctl is initialized. | 688 | * main phase. The fs_info::balance_ctl is initialized. |
689 | * Set and cleared while holding fs_info::balance_mutex. | ||
789 | */ | 690 | */ |
790 | BTRFS_FS_BALANCE_RUNNING, | 691 | BTRFS_FS_BALANCE_RUNNING, |
791 | 692 | ||
792 | /* Indicate that the cleaner thread is awake and doing something. */ | 693 | /* Indicate that the cleaner thread is awake and doing something. */ |
793 | BTRFS_FS_CLEANER_RUNNING, | 694 | BTRFS_FS_CLEANER_RUNNING, |
695 | |||
696 | /* | ||
697 | * The checksumming has an optimized version and is considered fast, | ||
698 | * so we don't need to offload checksums to workqueues. | ||
699 | */ | ||
700 | BTRFS_FS_CSUM_IMPL_FAST, | ||
794 | }; | 701 | }; |
795 | 702 | ||
796 | struct btrfs_fs_info { | 703 | struct btrfs_fs_info { |
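BTRFS_FS_CSUM_IMPL_FAST is a plain fs_info state bit like its neighbours, so consumers would test it with the usual bitops when deciding whether checksumming can stay in the submitting context. A hedged sketch of such a check; both called functions are hypothetical names, only the flag test reflects this patch:

	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
		ret = csum_bio_inline(inode, bio);	/* hypothetical */
	else
		ret = queue_csum_work(fs_info, bio);	/* hypothetical */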
@@ -824,7 +731,7 @@ struct btrfs_fs_info { | |||
824 | struct extent_io_tree *pinned_extents; | 731 | struct extent_io_tree *pinned_extents; |
825 | 732 | ||
826 | /* logical->physical extent mapping */ | 733 | /* logical->physical extent mapping */ |
827 | struct btrfs_mapping_tree mapping_tree; | 734 | struct extent_map_tree mapping_tree; |
828 | 735 | ||
829 | /* | 736 | /* |
830 | * block reservation for extent, checksum, root tree and | 737 | * block reservation for extent, checksum, root tree and |
@@ -1160,6 +1067,14 @@ struct btrfs_fs_info { | |||
1160 | spinlock_t swapfile_pins_lock; | 1067 | spinlock_t swapfile_pins_lock; |
1161 | struct rb_root swapfile_pins; | 1068 | struct rb_root swapfile_pins; |
1162 | 1069 | ||
1070 | struct crypto_shash *csum_shash; | ||
1071 | |||
1072 | /* | ||
1073 | * Number of send operations in progress. | ||
1074 | * Updated while holding fs_info::balance_mutex. | ||
1075 | */ | ||
1076 | int send_in_progress; | ||
1077 | |||
1163 | #ifdef CONFIG_BTRFS_FS_REF_VERIFY | 1078 | #ifdef CONFIG_BTRFS_FS_REF_VERIFY |
1164 | spinlock_t ref_verify_lock; | 1079 | spinlock_t ref_verify_lock; |
1165 | struct rb_root block_tree; | 1080 | struct rb_root block_tree; |
@@ -2451,6 +2366,11 @@ static inline int btrfs_super_csum_size(const struct btrfs_super_block *s) | |||
2451 | return btrfs_csum_sizes[t]; | 2366 | return btrfs_csum_sizes[t]; |
2452 | } | 2367 | } |
2453 | 2368 | ||
2369 | static inline const char *btrfs_super_csum_name(u16 csum_type) | ||
2370 | { | ||
2371 | /* csum type is validated at mount time */ | ||
2372 | return btrfs_csum_names[csum_type]; | ||
2373 | } | ||
2454 | 2374 | ||
2455 | /* | 2375 | /* |
2456 | * The leaf data grows from end-to-front in the node. | 2376 | * The leaf data grows from end-to-front in the node. |
@@ -2642,6 +2562,16 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, | |||
2642 | ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ | 2562 | ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ |
2643 | btrfs_item_offset_nr(leaf, slot))) | 2563 | btrfs_item_offset_nr(leaf, slot))) |
2644 | 2564 | ||
2565 | static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) | ||
2566 | { | ||
2567 | return crc32c(crc, address, length); | ||
2568 | } | ||
2569 | |||
2570 | static inline void btrfs_crc32c_final(u32 crc, u8 *result) | ||
2571 | { | ||
2572 | put_unaligned_le32(~crc, result); | ||
2573 | } | ||
2574 | |||
2645 | static inline u64 btrfs_name_hash(const char *name, int len) | 2575 | static inline u64 btrfs_name_hash(const char *name, int len) |
2646 | { | 2576 | { |
2647 | return crc32c((u32)~1, name, len); | 2577 | return crc32c((u32)~1, name, len); |
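btrfs_crc32c() is a thin wrapper around crc32c(), and btrfs_crc32c_final() stores the bit-inverted result in little-endian byte order, mirroring the on-disk layout the old btrfs_csum_final() produced. A hedged example of the pair in use; buf and len are illustrative:

	u8 result[4];		/* 4 bytes for crc32c */
	u32 crc = ~(u32)0;

	crc = btrfs_crc32c(crc, buf, len);
	btrfs_crc32c_final(crc, result);	/* put_unaligned_le32(~crc, ...) */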
@@ -2656,12 +2586,6 @@ static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, | |||
2656 | return (u64) crc32c(parent_objectid, name, len); | 2586 | return (u64) crc32c(parent_objectid, name, len); |
2657 | } | 2587 | } |
2658 | 2588 | ||
2659 | static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) | ||
2660 | { | ||
2661 | return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && | ||
2662 | (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); | ||
2663 | } | ||
2664 | |||
2665 | static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) | 2589 | static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) |
2666 | { | 2590 | { |
2667 | return mapping_gfp_constraint(mapping, ~__GFP_FS); | 2591 | return mapping_gfp_constraint(mapping, ~__GFP_FS); |
@@ -2698,8 +2622,6 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, | |||
2698 | return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; | 2622 | return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; |
2699 | } | 2623 | } |
2700 | 2624 | ||
2701 | int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); | ||
2702 | bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); | ||
2703 | void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, | 2625 | void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, |
2704 | const u64 start); | 2626 | const u64 start); |
2705 | void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); | 2627 | void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); |
@@ -2814,17 +2736,28 @@ enum btrfs_flush_state { | |||
2814 | COMMIT_TRANS = 9, | 2736 | COMMIT_TRANS = 9, |
2815 | }; | 2737 | }; |
2816 | 2738 | ||
2817 | int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); | 2739 | /* |
2818 | int btrfs_check_data_free_space(struct inode *inode, | 2740 | * control flags for do_chunk_alloc's force field |
2819 | struct extent_changeset **reserved, u64 start, u64 len); | 2741 | * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk |
2820 | void btrfs_free_reserved_data_space(struct inode *inode, | 2742 | * if we really need one. |
2821 | struct extent_changeset *reserved, u64 start, u64 len); | 2743 | * |
2822 | void btrfs_delalloc_release_space(struct inode *inode, | 2744 | * CHUNK_ALLOC_LIMITED means to only try and allocate one |
2823 | struct extent_changeset *reserved, | 2745 | * if we have very few chunks already allocated. This is |
2824 | u64 start, u64 len, bool qgroup_free); | 2746 | * used as part of the clustering code to help make sure |
2825 | void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, | 2747 | * we have a good pool of storage to cluster in, without |
2826 | u64 len); | 2748 | * filling the FS with empty chunks |
2827 | void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); | 2749 | * |
2750 | * CHUNK_ALLOC_FORCE means it must try to allocate one | ||
2751 | * | ||
2752 | */ | ||
2753 | enum btrfs_chunk_alloc_enum { | ||
2754 | CHUNK_ALLOC_NO_FORCE, | ||
2755 | CHUNK_ALLOC_LIMITED, | ||
2756 | CHUNK_ALLOC_FORCE, | ||
2757 | }; | ||
2758 | |||
2759 | int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, | ||
2760 | enum btrfs_chunk_alloc_enum force); | ||
2828 | int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, | 2761 | int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, |
2829 | struct btrfs_block_rsv *rsv, | 2762 | struct btrfs_block_rsv *rsv, |
2830 | int nitems, bool use_global_rsv); | 2763 | int nitems, bool use_global_rsv); |
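The three constants map directly onto how aggressively the allocator should create a new chunk, as the comment above explains. A hedged call sketch; the transaction handle and block group flags are illustrative:

	/* only allocate a fresh metadata chunk if it is really needed */
	ret = btrfs_chunk_alloc(trans, BTRFS_BLOCK_GROUP_METADATA,
				CHUNK_ALLOC_NO_FORCE);
	if (ret < 0)
		return ret;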
@@ -2834,41 +2767,6 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, | |||
2834 | bool qgroup_free); | 2767 | bool qgroup_free); |
2835 | 2768 | ||
2836 | int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); | 2769 | int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); |
2837 | void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, | ||
2838 | bool qgroup_free); | ||
2839 | int btrfs_delalloc_reserve_space(struct inode *inode, | ||
2840 | struct extent_changeset **reserved, u64 start, u64 len); | ||
2841 | void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); | ||
2842 | struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, | ||
2843 | unsigned short type); | ||
2844 | void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, | ||
2845 | struct btrfs_block_rsv *rsv, | ||
2846 | unsigned short type); | ||
2847 | void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, | ||
2848 | struct btrfs_block_rsv *rsv); | ||
2849 | int btrfs_block_rsv_add(struct btrfs_root *root, | ||
2850 | struct btrfs_block_rsv *block_rsv, u64 num_bytes, | ||
2851 | enum btrfs_reserve_flush_enum flush); | ||
2852 | int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); | ||
2853 | int btrfs_block_rsv_refill(struct btrfs_root *root, | ||
2854 | struct btrfs_block_rsv *block_rsv, u64 min_reserved, | ||
2855 | enum btrfs_reserve_flush_enum flush); | ||
2856 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, | ||
2857 | struct btrfs_block_rsv *dst_rsv, u64 num_bytes, | ||
2858 | bool update_size); | ||
2859 | int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, | ||
2860 | struct btrfs_block_rsv *dest, u64 num_bytes, | ||
2861 | int min_factor); | ||
2862 | void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, | ||
2863 | struct btrfs_block_rsv *block_rsv, | ||
2864 | u64 num_bytes); | ||
2865 | void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); | ||
2866 | void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); | ||
2867 | int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, | ||
2868 | enum btrfs_reserve_flush_enum flush); | ||
2869 | void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, | ||
2870 | struct btrfs_block_rsv *src, | ||
2871 | u64 num_bytes); | ||
2872 | int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); | 2770 | int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); |
2873 | void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); | 2771 | void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); |
2874 | void btrfs_put_block_group_cache(struct btrfs_fs_info *info); | 2772 | void btrfs_put_block_group_cache(struct btrfs_fs_info *info); |
@@ -3186,7 +3084,8 @@ int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, | |||
3186 | struct btrfs_dio_private; | 3084 | struct btrfs_dio_private; |
3187 | int btrfs_del_csums(struct btrfs_trans_handle *trans, | 3085 | int btrfs_del_csums(struct btrfs_trans_handle *trans, |
3188 | struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); | 3086 | struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); |
3189 | blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); | 3087 | blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, |
3088 | u8 *dst); | ||
3190 | blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, | 3089 | blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, |
3191 | u64 logical_offset); | 3090 | u64 logical_offset); |
3192 | int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, | 3091 | int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, |
@@ -3514,8 +3413,7 @@ __cold | |||
3514 | static inline void assfail(const char *expr, const char *file, int line) | 3413 | static inline void assfail(const char *expr, const char *file, int line) |
3515 | { | 3414 | { |
3516 | if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { | 3415 | if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { |
3517 | pr_err("assertion failed: %s, file: %s, line: %d\n", | 3416 | pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); |
3518 | expr, file, line); | ||
3519 | BUG(); | 3417 | BUG(); |
3520 | } | 3418 | } |
3521 | } | 3419 | } |
@@ -3599,10 +3497,11 @@ do { \ | |||
3599 | /* compatibility and incompatibility defines */ | 3497 | /* compatibility and incompatibility defines */ |
3600 | 3498 | ||
3601 | #define btrfs_set_fs_incompat(__fs_info, opt) \ | 3499 | #define btrfs_set_fs_incompat(__fs_info, opt) \ |
3602 | __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) | 3500 | __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ |
3501 | #opt) | ||
3603 | 3502 | ||
3604 | static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, | 3503 | static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, |
3605 | u64 flag) | 3504 | u64 flag, const char* name) |
3606 | { | 3505 | { |
3607 | struct btrfs_super_block *disk_super; | 3506 | struct btrfs_super_block *disk_super; |
3608 | u64 features; | 3507 | u64 features; |
@@ -3615,18 +3514,20 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, | |||
3615 | if (!(features & flag)) { | 3514 | if (!(features & flag)) { |
3616 | features |= flag; | 3515 | features |= flag; |
3617 | btrfs_set_super_incompat_flags(disk_super, features); | 3516 | btrfs_set_super_incompat_flags(disk_super, features); |
3618 | btrfs_info(fs_info, "setting %llu feature flag", | 3517 | btrfs_info(fs_info, |
3619 | flag); | 3518 | "setting incompat feature flag for %s (0x%llx)", |
3519 | name, flag); | ||
3620 | } | 3520 | } |
3621 | spin_unlock(&fs_info->super_lock); | 3521 | spin_unlock(&fs_info->super_lock); |
3622 | } | 3522 | } |
3623 | } | 3523 | } |
3624 | 3524 | ||
3625 | #define btrfs_clear_fs_incompat(__fs_info, opt) \ | 3525 | #define btrfs_clear_fs_incompat(__fs_info, opt) \ |
3626 | __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) | 3526 | __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ |
3527 | #opt) | ||
3627 | 3528 | ||
3628 | static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, | 3529 | static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, |
3629 | u64 flag) | 3530 | u64 flag, const char* name) |
3630 | { | 3531 | { |
3631 | struct btrfs_super_block *disk_super; | 3532 | struct btrfs_super_block *disk_super; |
3632 | u64 features; | 3533 | u64 features; |
@@ -3639,8 +3540,9 @@ static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, | |||
3639 | if (features & flag) { | 3540 | if (features & flag) { |
3640 | features &= ~flag; | 3541 | features &= ~flag; |
3641 | btrfs_set_super_incompat_flags(disk_super, features); | 3542 | btrfs_set_super_incompat_flags(disk_super, features); |
3642 | btrfs_info(fs_info, "clearing %llu feature flag", | 3543 | btrfs_info(fs_info, |
3643 | flag); | 3544 | "clearing incompat feature flag for %s (0x%llx)", |
3545 | name, flag); | ||
3644 | } | 3546 | } |
3645 | spin_unlock(&fs_info->super_lock); | 3547 | spin_unlock(&fs_info->super_lock); |
3646 | } | 3548 | } |
@@ -3657,10 +3559,11 @@ static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) | |||
3657 | } | 3559 | } |
3658 | 3560 | ||
3659 | #define btrfs_set_fs_compat_ro(__fs_info, opt) \ | 3561 | #define btrfs_set_fs_compat_ro(__fs_info, opt) \ |
3660 | __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) | 3562 | __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ |
3563 | #opt) | ||
3661 | 3564 | ||
3662 | static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, | 3565 | static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, |
3663 | u64 flag) | 3566 | u64 flag, const char *name) |
3664 | { | 3567 | { |
3665 | struct btrfs_super_block *disk_super; | 3568 | struct btrfs_super_block *disk_super; |
3666 | u64 features; | 3569 | u64 features; |
@@ -3673,18 +3576,20 @@ static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, | |||
3673 | if (!(features & flag)) { | 3576 | if (!(features & flag)) { |
3674 | features |= flag; | 3577 | features |= flag; |
3675 | btrfs_set_super_compat_ro_flags(disk_super, features); | 3578 | btrfs_set_super_compat_ro_flags(disk_super, features); |
3676 | btrfs_info(fs_info, "setting %llu ro feature flag", | 3579 | btrfs_info(fs_info, |
3677 | flag); | 3580 | "setting compat-ro feature flag for %s (0x%llx)", |
3581 | name, flag); | ||
3678 | } | 3582 | } |
3679 | spin_unlock(&fs_info->super_lock); | 3583 | spin_unlock(&fs_info->super_lock); |
3680 | } | 3584 | } |
3681 | } | 3585 | } |
3682 | 3586 | ||
3683 | #define btrfs_clear_fs_compat_ro(__fs_info, opt) \ | 3587 | #define btrfs_clear_fs_compat_ro(__fs_info, opt) \ |
3684 | __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) | 3588 | __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ |
3589 | #opt) | ||
3685 | 3590 | ||
3686 | static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, | 3591 | static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, |
3687 | u64 flag) | 3592 | u64 flag, const char *name) |
3688 | { | 3593 | { |
3689 | struct btrfs_super_block *disk_super; | 3594 | struct btrfs_super_block *disk_super; |
3690 | u64 features; | 3595 | u64 features; |
@@ -3697,8 +3602,9 @@ static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, | |||
3697 | if (features & flag) { | 3602 | if (features & flag) { |
3698 | features &= ~flag; | 3603 | features &= ~flag; |
3699 | btrfs_set_super_compat_ro_flags(disk_super, features); | 3604 | btrfs_set_super_compat_ro_flags(disk_super, features); |
3700 | btrfs_info(fs_info, "clearing %llu ro feature flag", | 3605 | btrfs_info(fs_info, |
3701 | flag); | 3606 | "clearing compat-ro feature flag for %s (0x%llx)", |
3607 | name, flag); | ||
3702 | } | 3608 | } |
3703 | spin_unlock(&fs_info->super_lock); | 3609 | spin_unlock(&fs_info->super_lock); |
3704 | } | 3610 | } |
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c new file mode 100644 index 000000000000..17f7c0d38768 --- /dev/null +++ b/fs/btrfs/delalloc-space.c | |||
@@ -0,0 +1,494 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | #include "ctree.h" | ||
4 | #include "delalloc-space.h" | ||
5 | #include "block-rsv.h" | ||
6 | #include "btrfs_inode.h" | ||
7 | #include "space-info.h" | ||
8 | #include "transaction.h" | ||
9 | #include "qgroup.h" | ||
10 | |||
11 | int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) | ||
12 | { | ||
13 | struct btrfs_root *root = inode->root; | ||
14 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
15 | struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; | ||
16 | u64 used; | ||
17 | int ret = 0; | ||
18 | int need_commit = 2; | ||
19 | int have_pinned_space; | ||
20 | |||
21 | /* Make sure bytes are sectorsize aligned */ | ||
22 | bytes = ALIGN(bytes, fs_info->sectorsize); | ||
23 | |||
24 | if (btrfs_is_free_space_inode(inode)) { | ||
25 | need_commit = 0; | ||
26 | ASSERT(current->journal_info); | ||
27 | } | ||
28 | |||
29 | again: | ||
30 | /* Make sure we have enough space to handle the data first */ | ||
31 | spin_lock(&data_sinfo->lock); | ||
32 | used = btrfs_space_info_used(data_sinfo, true); | ||
33 | |||
34 | if (used + bytes > data_sinfo->total_bytes) { | ||
35 | struct btrfs_trans_handle *trans; | ||
36 | |||
37 | /* | ||
38 | * If we don't have enough free bytes in this space then we need | ||
39 | * to alloc a new chunk. | ||
40 | */ | ||
41 | if (!data_sinfo->full) { | ||
42 | u64 alloc_target; | ||
43 | |||
44 | data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; | ||
45 | spin_unlock(&data_sinfo->lock); | ||
46 | |||
47 | alloc_target = btrfs_data_alloc_profile(fs_info); | ||
48 | /* | ||
49 | * It is ugly that we don't call nolock join | ||
50 | * transaction for the free space inode case here. | ||
51 | * But it is safe because we only do the data space | ||
52 | * reservation for the free space cache in the | ||
53 | * transaction context, the common join transaction | ||
54 | * just increases the counter of the current transaction | ||
55 | * handle and doesn't try to acquire the trans_lock of | ||
56 | * the fs. | ||
57 | */ | ||
58 | trans = btrfs_join_transaction(root); | ||
59 | if (IS_ERR(trans)) | ||
60 | return PTR_ERR(trans); | ||
61 | |||
62 | ret = btrfs_chunk_alloc(trans, alloc_target, | ||
63 | CHUNK_ALLOC_NO_FORCE); | ||
64 | btrfs_end_transaction(trans); | ||
65 | if (ret < 0) { | ||
66 | if (ret != -ENOSPC) | ||
67 | return ret; | ||
68 | else { | ||
69 | have_pinned_space = 1; | ||
70 | goto commit_trans; | ||
71 | } | ||
72 | } | ||
73 | |||
74 | goto again; | ||
75 | } | ||
76 | |||
77 | /* | ||
78 | * If we don't have enough pinned space to deal with this | ||
79 | * allocation, and no removed chunk in current transaction, | ||
80 | * don't bother committing the transaction. | ||
81 | */ | ||
82 | have_pinned_space = __percpu_counter_compare( | ||
83 | &data_sinfo->total_bytes_pinned, | ||
84 | used + bytes - data_sinfo->total_bytes, | ||
85 | BTRFS_TOTAL_BYTES_PINNED_BATCH); | ||
86 | spin_unlock(&data_sinfo->lock); | ||
87 | |||
88 | /* Commit the current transaction and try again */ | ||
89 | commit_trans: | ||
90 | if (need_commit) { | ||
91 | need_commit--; | ||
92 | |||
93 | if (need_commit > 0) { | ||
94 | btrfs_start_delalloc_roots(fs_info, -1); | ||
95 | btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, | ||
96 | (u64)-1); | ||
97 | } | ||
98 | |||
99 | trans = btrfs_join_transaction(root); | ||
100 | if (IS_ERR(trans)) | ||
101 | return PTR_ERR(trans); | ||
102 | if (have_pinned_space >= 0 || | ||
103 | test_bit(BTRFS_TRANS_HAVE_FREE_BGS, | ||
104 | &trans->transaction->flags) || | ||
105 | need_commit > 0) { | ||
106 | ret = btrfs_commit_transaction(trans); | ||
107 | if (ret) | ||
108 | return ret; | ||
109 | /* | ||
110 | * The cleaner kthread might still be doing iput | ||
111 | * operations. Wait for it to finish so that | ||
112 | * more space is released. We don't need to | ||
113 | * explicitly run the delayed iputs here because | ||
114 | * the commit_transaction would have woken up | ||
115 | * the cleaner. | ||
116 | */ | ||
117 | ret = btrfs_wait_on_delayed_iputs(fs_info); | ||
118 | if (ret) | ||
119 | return ret; | ||
120 | goto again; | ||
121 | } else { | ||
122 | btrfs_end_transaction(trans); | ||
123 | } | ||
124 | } | ||
125 | |||
126 | trace_btrfs_space_reservation(fs_info, | ||
127 | "space_info:enospc", | ||
128 | data_sinfo->flags, bytes, 1); | ||
129 | return -ENOSPC; | ||
130 | } | ||
131 | btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); | ||
132 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
133 | data_sinfo->flags, bytes, 1); | ||
134 | spin_unlock(&data_sinfo->lock); | ||
135 | |||
136 | return 0; | ||
137 | } | ||
138 | |||
139 | int btrfs_check_data_free_space(struct inode *inode, | ||
140 | struct extent_changeset **reserved, u64 start, u64 len) | ||
141 | { | ||
142 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); | ||
143 | int ret; | ||
144 | |||
145 | /* align the range */ | ||
146 | len = round_up(start + len, fs_info->sectorsize) - | ||
147 | round_down(start, fs_info->sectorsize); | ||
148 | start = round_down(start, fs_info->sectorsize); | ||
149 | |||
150 | ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); | ||
151 | if (ret < 0) | ||
152 | return ret; | ||
153 | |||
154 | /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ | ||
155 | ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); | ||
156 | if (ret < 0) | ||
157 | btrfs_free_reserved_data_space_noquota(inode, start, len); | ||
158 | else | ||
159 | ret = 0; | ||
160 | return ret; | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * Called if we need to clear a data reservation for this inode | ||
165 | * Normally in an error case. | ||
166 | * | ||
167 | * This one will *NOT* use the accurate qgroup reserved space API, just for the | ||
168 | * case where we can't sleep and are sure it won't affect qgroup reserved space. | ||
169 | * Like clear_bit_hook(). | ||
170 | */ | ||
171 | void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, | ||
172 | u64 len) | ||
173 | { | ||
174 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); | ||
175 | struct btrfs_space_info *data_sinfo; | ||
176 | |||
177 | /* Make sure the range is aligned to sectorsize */ | ||
178 | len = round_up(start + len, fs_info->sectorsize) - | ||
179 | round_down(start, fs_info->sectorsize); | ||
180 | start = round_down(start, fs_info->sectorsize); | ||
181 | |||
182 | data_sinfo = fs_info->data_sinfo; | ||
183 | spin_lock(&data_sinfo->lock); | ||
184 | btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); | ||
185 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
186 | data_sinfo->flags, len, 0); | ||
187 | spin_unlock(&data_sinfo->lock); | ||
188 | } | ||
189 | |||
190 | /* | ||
191 | * Called if we need to clear a data reservation for this inode | ||
192 | * Normally in an error case. | ||
193 | * | ||
194 | * This one will handle the per-inode data rsv map for accurate reserved | ||
195 | * space framework. | ||
196 | */ | ||
197 | void btrfs_free_reserved_data_space(struct inode *inode, | ||
198 | struct extent_changeset *reserved, u64 start, u64 len) | ||
199 | { | ||
200 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
201 | |||
202 | /* Make sure the range is aligned to sectorsize */ | ||
203 | len = round_up(start + len, root->fs_info->sectorsize) - | ||
204 | round_down(start, root->fs_info->sectorsize); | ||
205 | start = round_down(start, root->fs_info->sectorsize); | ||
206 | |||
207 | btrfs_free_reserved_data_space_noquota(inode, start, len); | ||
208 | btrfs_qgroup_free_data(inode, reserved, start, len); | ||
209 | } | ||
210 | |||
211 | /** | ||
212 | * btrfs_inode_rsv_release - release any excessive reservation. | ||
213 | * @inode - the inode we need to release from. | ||
214 | * @qgroup_free - free or convert qgroup meta. | ||
215 | * Unlike normal operation, qgroup meta reservation needs to know if we are | ||
216 | * freeing qgroup reservation or just converting it into per-trans. Normally | ||
217 | * @qgroup_free is true for error handling, and false for normal release. | ||
218 | * | ||
219 | * This is the same as btrfs_block_rsv_release, except that it handles the | ||
220 | * tracepoint for the reservation. | ||
221 | */ | ||
222 | static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) | ||
223 | { | ||
224 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||
225 | struct btrfs_block_rsv *block_rsv = &inode->block_rsv; | ||
226 | u64 released = 0; | ||
227 | u64 qgroup_to_release = 0; | ||
228 | |||
229 | /* | ||
230 | * Since we statically set the block_rsv->size we just want to say we | ||
231 | * are releasing 0 bytes, and then we'll just get the reservation over | ||
232 | * the size free'd. | ||
233 | */ | ||
234 | released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, | ||
235 | &qgroup_to_release); | ||
236 | if (released > 0) | ||
237 | trace_btrfs_space_reservation(fs_info, "delalloc", | ||
238 | btrfs_ino(inode), released, 0); | ||
239 | if (qgroup_free) | ||
240 | btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); | ||
241 | else | ||
242 | btrfs_qgroup_convert_reserved_meta(inode->root, | ||
243 | qgroup_to_release); | ||
244 | } | ||
245 | |||
246 | static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, | ||
247 | struct btrfs_inode *inode) | ||
248 | { | ||
249 | struct btrfs_block_rsv *block_rsv = &inode->block_rsv; | ||
250 | u64 reserve_size = 0; | ||
251 | u64 qgroup_rsv_size = 0; | ||
252 | u64 csum_leaves; | ||
253 | unsigned outstanding_extents; | ||
254 | |||
255 | lockdep_assert_held(&inode->lock); | ||
256 | outstanding_extents = inode->outstanding_extents; | ||
257 | if (outstanding_extents) | ||
258 | reserve_size = btrfs_calc_trans_metadata_size(fs_info, | ||
259 | outstanding_extents + 1); | ||
260 | csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, | ||
261 | inode->csum_bytes); | ||
262 | reserve_size += btrfs_calc_trans_metadata_size(fs_info, | ||
263 | csum_leaves); | ||
264 | /* | ||
265 | * For qgroup rsv, the calculation is very simple: | ||
266 | * account one nodesize for each outstanding extent | ||
267 | * | ||
268 | * This is overestimating in most cases. | ||
269 | */ | ||
270 | qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; | ||
271 | |||
272 | spin_lock(&block_rsv->lock); | ||
273 | block_rsv->size = reserve_size; | ||
274 | block_rsv->qgroup_rsv_size = qgroup_rsv_size; | ||
275 | spin_unlock(&block_rsv->lock); | ||
276 | } | ||
277 | |||
278 | static void calc_inode_reservations(struct btrfs_fs_info *fs_info, | ||
279 | u64 num_bytes, u64 *meta_reserve, | ||
280 | u64 *qgroup_reserve) | ||
281 | { | ||
282 | u64 nr_extents = count_max_extents(num_bytes); | ||
283 | u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); | ||
284 | |||
285 | /* We add one for the inode update at finish ordered time */ | ||
286 | *meta_reserve = btrfs_calc_trans_metadata_size(fs_info, | ||
287 | nr_extents + csum_leaves + 1); | ||
288 | *qgroup_reserve = nr_extents * fs_info->nodesize; | ||
289 | } | ||
290 | |||
291 | int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) | ||
292 | { | ||
293 | struct btrfs_root *root = inode->root; | ||
294 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
295 | struct btrfs_block_rsv *block_rsv = &inode->block_rsv; | ||
296 | u64 meta_reserve, qgroup_reserve; | ||
297 | unsigned nr_extents; | ||
298 | enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; | ||
299 | int ret = 0; | ||
300 | bool delalloc_lock = true; | ||
301 | |||
302 | /* | ||
303 | * If we are a free space inode we need to not flush since we will be in | ||
304 | * the middle of a transaction commit. We also don't need the delalloc | ||
305 | * mutex since we won't race with anybody. We need this mostly to make | ||
306 | * lockdep shut its filthy mouth. | ||
307 | * | ||
308 | * If we have a transaction open (can happen if we call truncate_block | ||
309 | * from truncate), then we need FLUSH_LIMIT so we don't deadlock. | ||
310 | */ | ||
311 | if (btrfs_is_free_space_inode(inode)) { | ||
312 | flush = BTRFS_RESERVE_NO_FLUSH; | ||
313 | delalloc_lock = false; | ||
314 | } else { | ||
315 | if (current->journal_info) | ||
316 | flush = BTRFS_RESERVE_FLUSH_LIMIT; | ||
317 | |||
318 | if (btrfs_transaction_in_commit(fs_info)) | ||
319 | schedule_timeout(1); | ||
320 | } | ||
321 | |||
322 | if (delalloc_lock) | ||
323 | mutex_lock(&inode->delalloc_mutex); | ||
324 | |||
325 | num_bytes = ALIGN(num_bytes, fs_info->sectorsize); | ||
326 | |||
327 | /* | ||
328 | * We always want to do it this way, every other way is wrong and ends | ||
329 | * in tears. Pre-reserving the amount we are going to add will always | ||
330 | * be the right way, because otherwise if we have enough parallelism we | ||
331 | * could end up with thousands of inodes all holding little bits of | ||
332 | * reservations they were able to make previously and the only way to | ||
333 | * reclaim that space is to ENOSPC out the operations and clear | ||
334 | * everything out and try again, which is bad. This way we just | ||
335 | * over-reserve slightly, and clean up the mess when we are done. | ||
336 | */ | ||
337 | calc_inode_reservations(fs_info, num_bytes, &meta_reserve, | ||
338 | &qgroup_reserve); | ||
339 | ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); | ||
340 | if (ret) | ||
341 | goto out_fail; | ||
342 | ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); | ||
343 | if (ret) | ||
344 | goto out_qgroup; | ||
345 | |||
346 | /* | ||
347 | * Now we need to update our outstanding extents and csum bytes _first_ | ||
348 | * and then add the reservation to the block_rsv. This keeps us from | ||
349 | * racing with an ordered completion or some such that would think it | ||
350 | * needs to free the reservation we just made. | ||
351 | */ | ||
352 | spin_lock(&inode->lock); | ||
353 | nr_extents = count_max_extents(num_bytes); | ||
354 | btrfs_mod_outstanding_extents(inode, nr_extents); | ||
355 | inode->csum_bytes += num_bytes; | ||
356 | btrfs_calculate_inode_block_rsv_size(fs_info, inode); | ||
357 | spin_unlock(&inode->lock); | ||
358 | |||
359 | /* Now we can safely add our space to our block rsv */ | ||
360 | btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false); | ||
361 | trace_btrfs_space_reservation(root->fs_info, "delalloc", | ||
362 | btrfs_ino(inode), meta_reserve, 1); | ||
363 | |||
364 | spin_lock(&block_rsv->lock); | ||
365 | block_rsv->qgroup_rsv_reserved += qgroup_reserve; | ||
366 | spin_unlock(&block_rsv->lock); | ||
367 | |||
368 | if (delalloc_lock) | ||
369 | mutex_unlock(&inode->delalloc_mutex); | ||
370 | return 0; | ||
371 | out_qgroup: | ||
372 | btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); | ||
373 | out_fail: | ||
374 | btrfs_inode_rsv_release(inode, true); | ||
375 | if (delalloc_lock) | ||
376 | mutex_unlock(&inode->delalloc_mutex); | ||
377 | return ret; | ||
378 | } | ||
379 | |||
380 | /** | ||
381 | * btrfs_delalloc_release_metadata - release a metadata reservation for an inode | ||
382 | * @inode: the inode to release the reservation for. | ||
383 | * @num_bytes: the number of bytes we are releasing. | ||
384 | * @qgroup_free: free qgroup reservation or convert it to per-trans reservation | ||
385 | * | ||
386 | * This will release the metadata reservation for an inode. This can be called | ||
387 | * once we complete IO for a given set of bytes to release their metadata | ||
388 | * reservations, or on error for the same reason. | ||
389 | */ | ||
390 | void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, | ||
391 | bool qgroup_free) | ||
392 | { | ||
393 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||
394 | |||
395 | num_bytes = ALIGN(num_bytes, fs_info->sectorsize); | ||
396 | spin_lock(&inode->lock); | ||
397 | inode->csum_bytes -= num_bytes; | ||
398 | btrfs_calculate_inode_block_rsv_size(fs_info, inode); | ||
399 | spin_unlock(&inode->lock); | ||
400 | |||
401 | if (btrfs_is_testing(fs_info)) | ||
402 | return; | ||
403 | |||
404 | btrfs_inode_rsv_release(inode, qgroup_free); | ||
405 | } | ||
406 | |||
407 | /** | ||
408 | * btrfs_delalloc_release_extents - release our outstanding_extents | ||
409 | * @inode: the inode to balance the reservation for. | ||
410 | * @num_bytes: the number of bytes we originally reserved | ||
411 | * @qgroup_free: do we need to free qgroup meta reservation or convert them. | ||
412 | * | ||
413 | * When we reserve space we increase outstanding_extents for the extents we may | ||
414 | * add. Once we've set the range as delalloc or created our ordered extents we | ||
415 | * have outstanding_extents to track the real usage, so we use this to free our | ||
416 | * temporarily tracked outstanding_extents. This _must_ be used in conjunction | ||
417 | * with btrfs_delalloc_reserve_metadata. | ||
418 | */ | ||
419 | void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, | ||
420 | bool qgroup_free) | ||
421 | { | ||
422 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||
423 | unsigned num_extents; | ||
424 | |||
425 | spin_lock(&inode->lock); | ||
426 | num_extents = count_max_extents(num_bytes); | ||
427 | btrfs_mod_outstanding_extents(inode, -num_extents); | ||
428 | btrfs_calculate_inode_block_rsv_size(fs_info, inode); | ||
429 | spin_unlock(&inode->lock); | ||
430 | |||
431 | if (btrfs_is_testing(fs_info)) | ||
432 | return; | ||
433 | |||
434 | btrfs_inode_rsv_release(inode, qgroup_free); | ||
435 | } | ||
436 | |||
437 | /** | ||
438 | * btrfs_delalloc_reserve_space - reserve data and metadata space for | ||
439 | * delalloc | ||
440 | * @inode: inode we're writing to | ||
441 | * @start: start range we are writing to | ||
442 | * @len: the length of the range we are writing to | ||
443 | * @reserved: mandatory parameter, record actually reserved qgroup ranges of | ||
444 | * current reservation. | ||
445 | * | ||
446 | * This will do the following things | ||
447 | * | ||
448 | * - reserve space in data space info for num bytes | ||
449 | * and reserve precious corresponding qgroup space | ||
450 | * (Done in check_data_free_space) | ||
451 | * | ||
452 | * - reserve space for metadata space, based on the number of outstanding | ||
453 | * extents and how much csums will be needed | ||
454 | * also reserve metadata space in a per root over-reserve method. | ||
455 | * - add to the inodes->delalloc_bytes | ||
456 | * - add it to the fs_info's delalloc inodes list. | ||
457 | * (Above 3 all done in delalloc_reserve_metadata) | ||
458 | * | ||
459 | * Return 0 for success | ||
460 | * Return <0 for error (-ENOSPC or -EDQUOT) | ||
461 | */ | ||
462 | int btrfs_delalloc_reserve_space(struct inode *inode, | ||
463 | struct extent_changeset **reserved, u64 start, u64 len) | ||
464 | { | ||
465 | int ret; | ||
466 | |||
467 | ret = btrfs_check_data_free_space(inode, reserved, start, len); | ||
468 | if (ret < 0) | ||
469 | return ret; | ||
470 | ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); | ||
471 | if (ret < 0) | ||
472 | btrfs_free_reserved_data_space(inode, *reserved, start, len); | ||
473 | return ret; | ||
474 | } | ||
475 | |||
476 | /** | ||
477 | * btrfs_delalloc_release_space - release data and metadata space for delalloc | ||
478 | * @inode: inode we're releasing space for | ||
479 | * @start: start position of the space already reserved | ||
480 | * @len: length of the space already reserved | ||
481 | * @qgroup_free: whether to free or convert the qgroup meta reservation | ||
482 | * | ||
483 | * This function will release the metadata space that was not used and will | ||
484 | * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes | ||
485 | * list if there are no delalloc bytes left. | ||
486 | * Also it will handle the qgroup reserved space. | ||
487 | */ | ||
488 | void btrfs_delalloc_release_space(struct inode *inode, | ||
489 | struct extent_changeset *reserved, | ||
490 | u64 start, u64 len, bool qgroup_free) | ||
491 | { | ||
492 | btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); | ||
493 | btrfs_free_reserved_data_space(inode, reserved, start, len); | ||
494 | } | ||
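The large comment in btrfs_delalloc_reserve_metadata() above describes the over-reserve-then-trim strategy: reserve metadata for the worst-case extent count up front, then give back the excess once the real extent layout is known. A rough userspace model of that idea follows; MAX_EXTENT_SIZE and ITEM_COST are stand-in constants, not btrfs's real values, and the structs are simplified.

#include <stdio.h>
#include <stdint.h>

#define MAX_EXTENT_SIZE	(128ULL * 1024 * 1024)	/* assumed per-extent cap */
#define ITEM_COST	(16ULL * 1024)		/* assumed bytes per metadata item */

struct inode_rsv {
	uint64_t size;		/* what we think we need */
	uint64_t reserved;	/* what we actually hold */
};

static uint64_t count_max_extents(uint64_t bytes)
{
	return (bytes + MAX_EXTENT_SIZE - 1) / MAX_EXTENT_SIZE;
}

/* Reserve for the worst case up front; +1 item for the inode update. */
static void reserve(struct inode_rsv *rsv, uint64_t bytes)
{
	uint64_t items = count_max_extents(bytes) + 1;

	rsv->size += items * ITEM_COST;
	rsv->reserved += items * ITEM_COST;
}

/* Once the real extent count is known, shrink the size and free the excess. */
static uint64_t release_excess(struct inode_rsv *rsv, uint64_t real_extents)
{
	uint64_t need = (real_extents + 1) * ITEM_COST;
	uint64_t excess = rsv->reserved > need ? rsv->reserved - need : 0;

	rsv->size = need;
	rsv->reserved -= excess;
	return excess;
}

int main(void)
{
	struct inode_rsv rsv = { 0, 0 };

	reserve(&rsv, 512ULL * 1024 * 1024);	/* 512 MiB write: 4 extents worst case */
	printf("reserved %llu bytes up front\n",
	       (unsigned long long)rsv.reserved);
	printf("freed %llu bytes after the write produced 1 extent\n",
	       (unsigned long long)release_excess(&rsv, 1));
	return 0;
}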
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h new file mode 100644 index 000000000000..54466fbd7075 --- /dev/null +++ b/fs/btrfs/delalloc-space.h | |||
@@ -0,0 +1,23 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | |||
3 | #ifndef BTRFS_DELALLOC_SPACE_H | ||
4 | #define BTRFS_DELALLOC_SPACE_H | ||
5 | |||
6 | struct extent_changeset; | ||
7 | |||
8 | int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); | ||
9 | int btrfs_check_data_free_space(struct inode *inode, | ||
10 | struct extent_changeset **reserved, u64 start, u64 len); | ||
11 | void btrfs_free_reserved_data_space(struct inode *inode, | ||
12 | struct extent_changeset *reserved, u64 start, u64 len); | ||
13 | void btrfs_delalloc_release_space(struct inode *inode, | ||
14 | struct extent_changeset *reserved, | ||
15 | u64 start, u64 len, bool qgroup_free); | ||
16 | void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, | ||
17 | u64 len); | ||
18 | void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, | ||
19 | bool qgroup_free); | ||
20 | int btrfs_delalloc_reserve_space(struct inode *inode, | ||
21 | struct extent_changeset **reserved, u64 start, u64 len); | ||
22 | |||
23 | #endif /* BTRFS_DELALLOC_SPACE_H */ | ||
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index a73fc23e2961..9a91d1eb0af4 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include "delayed-ref.h" | 10 | #include "delayed-ref.h" |
11 | #include "transaction.h" | 11 | #include "transaction.h" |
12 | #include "qgroup.h" | 12 | #include "qgroup.h" |
13 | #include "space-info.h" | ||
13 | 14 | ||
14 | struct kmem_cache *btrfs_delayed_ref_head_cachep; | 15 | struct kmem_cache *btrfs_delayed_ref_head_cachep; |
15 | struct kmem_cache *btrfs_delayed_tree_ref_cachep; | 16 | struct kmem_cache *btrfs_delayed_tree_ref_cachep; |
@@ -24,6 +25,179 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep; | |||
24 | * of hammering updates on the extent allocation tree. | 25 | * of hammering updates on the extent allocation tree. |
25 | */ | 26 | */ |
26 | 27 | ||
28 | bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) | ||
29 | { | ||
30 | struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; | ||
31 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
32 | bool ret = false; | ||
33 | u64 reserved; | ||
34 | |||
35 | spin_lock(&global_rsv->lock); | ||
36 | reserved = global_rsv->reserved; | ||
37 | spin_unlock(&global_rsv->lock); | ||
38 | |||
39 | /* | ||
40 | * Since the global reserve is just kind of magic we don't really want | ||
41 | * to rely on it to save our bacon, so if our size is more than the | ||
42 | * delayed_refs_rsv and the global rsv then it's time to think about | ||
43 | * bailing. | ||
44 | */ | ||
45 | spin_lock(&delayed_refs_rsv->lock); | ||
46 | reserved += delayed_refs_rsv->reserved; | ||
47 | if (delayed_refs_rsv->size >= reserved) | ||
48 | ret = true; | ||
49 | spin_unlock(&delayed_refs_rsv->lock); | ||
50 | return ret; | ||
51 | } | ||
52 | |||
53 | int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) | ||
54 | { | ||
55 | u64 num_entries = | ||
56 | atomic_read(&trans->transaction->delayed_refs.num_entries); | ||
57 | u64 avg_runtime; | ||
58 | u64 val; | ||
59 | |||
60 | smp_mb(); | ||
61 | avg_runtime = trans->fs_info->avg_delayed_ref_runtime; | ||
62 | val = num_entries * avg_runtime; | ||
63 | if (val >= NSEC_PER_SEC) | ||
64 | return 1; | ||
65 | if (val >= NSEC_PER_SEC / 2) | ||
66 | return 2; | ||
67 | |||
68 | return btrfs_check_space_for_delayed_refs(trans->fs_info); | ||
69 | } | ||
70 | |||
71 | /** | ||
72 | * btrfs_delayed_refs_rsv_release - release a ref head's reservation. | ||
73 | * @fs_info - the fs_info for our fs. | ||
74 | * @nr - the number of items to drop. | ||
75 | * | ||
76 | * This drops the delayed ref head's count from the delayed refs rsv and frees | ||
77 | * any excess reservation we had. | ||
78 | */ | ||
79 | void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) | ||
80 | { | ||
81 | struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; | ||
82 | u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); | ||
83 | u64 released = 0; | ||
84 | |||
85 | released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, | ||
86 | NULL); | ||
87 | if (released) | ||
88 | trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", | ||
89 | 0, released, 0); | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv | ||
94 | * @trans - the trans that may have generated delayed refs | ||
95 | * | ||
96 | * This is to be called any time we may have adjusted trans->delayed_ref_updates; | ||
97 | * it will calculate the additional size and add it to the delayed_refs_rsv. | ||
98 | */ | ||
99 | void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) | ||
100 | { | ||
101 | struct btrfs_fs_info *fs_info = trans->fs_info; | ||
102 | struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; | ||
103 | u64 num_bytes; | ||
104 | |||
105 | if (!trans->delayed_ref_updates) | ||
106 | return; | ||
107 | |||
108 | num_bytes = btrfs_calc_trans_metadata_size(fs_info, | ||
109 | trans->delayed_ref_updates); | ||
110 | spin_lock(&delayed_rsv->lock); | ||
111 | delayed_rsv->size += num_bytes; | ||
112 | delayed_rsv->full = 0; | ||
113 | spin_unlock(&delayed_rsv->lock); | ||
114 | trans->delayed_ref_updates = 0; | ||
115 | } | ||
116 | |||
117 | /** | ||
118 | * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. | ||
119 | * @fs_info - the fs info for our fs. | ||
120 | * @src - the source block rsv to transfer from. | ||
121 | * @num_bytes - the number of bytes to transfer. | ||
122 | * | ||
123 | * This transfers up to the num_bytes amount from the src rsv to the | ||
124 | * delayed_refs_rsv. Any extra bytes are returned to the space info. | ||
125 | */ | ||
126 | void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, | ||
127 | struct btrfs_block_rsv *src, | ||
128 | u64 num_bytes) | ||
129 | { | ||
130 | struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; | ||
131 | u64 to_free = 0; | ||
132 | |||
133 | spin_lock(&src->lock); | ||
134 | src->reserved -= num_bytes; | ||
135 | src->size -= num_bytes; | ||
136 | spin_unlock(&src->lock); | ||
137 | |||
138 | spin_lock(&delayed_refs_rsv->lock); | ||
139 | if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { | ||
140 | u64 delta = delayed_refs_rsv->size - | ||
141 | delayed_refs_rsv->reserved; | ||
142 | if (num_bytes > delta) { | ||
143 | to_free = num_bytes - delta; | ||
144 | num_bytes = delta; | ||
145 | } | ||
146 | } else { | ||
147 | to_free = num_bytes; | ||
148 | num_bytes = 0; | ||
149 | } | ||
150 | |||
151 | if (num_bytes) | ||
152 | delayed_refs_rsv->reserved += num_bytes; | ||
153 | if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) | ||
154 | delayed_refs_rsv->full = 1; | ||
155 | spin_unlock(&delayed_refs_rsv->lock); | ||
156 | |||
157 | if (num_bytes) | ||
158 | trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", | ||
159 | 0, num_bytes, 1); | ||
160 | if (to_free) | ||
161 | btrfs_space_info_add_old_bytes(fs_info, | ||
162 | delayed_refs_rsv->space_info, to_free); | ||
163 | } | ||
164 | |||
165 | /** | ||
166 | * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. | ||
167 | * @fs_info - the fs_info for our fs. | ||
168 | * @flush - control how we can flush for this reservation. | ||
169 | * | ||
170 | * This will refill the delayed block_rsv up to one item's worth of space and | ||
171 | * will return -ENOSPC if we can't make the reservation. | ||
172 | */ | ||
173 | int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, | ||
174 | enum btrfs_reserve_flush_enum flush) | ||
175 | { | ||
176 | struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; | ||
177 | u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); | ||
178 | u64 num_bytes = 0; | ||
179 | int ret = -ENOSPC; | ||
180 | |||
181 | spin_lock(&block_rsv->lock); | ||
182 | if (block_rsv->reserved < block_rsv->size) { | ||
183 | num_bytes = block_rsv->size - block_rsv->reserved; | ||
184 | num_bytes = min(num_bytes, limit); | ||
185 | } | ||
186 | spin_unlock(&block_rsv->lock); | ||
187 | |||
188 | if (!num_bytes) | ||
189 | return 0; | ||
190 | |||
191 | ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv, | ||
192 | num_bytes, flush); | ||
193 | if (ret) | ||
194 | return ret; | ||
195 | btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); | ||
196 | trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", | ||
197 | 0, num_bytes, 1); | ||
198 | return 0; | ||
199 | } | ||
200 | |||
27 | /* | 201 | /* |
28 | * compare two delayed tree backrefs with same bytenr and type | 202 | * compare two delayed tree backrefs with same bytenr and type |
29 | */ | 203 | */ |
@@ -957,13 +1131,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, | |||
957 | } | 1131 | } |
958 | 1132 | ||
959 | /* | 1133 | /* |
960 | * this does a simple search for the head node for a given extent. | 1134 | * This does a simple search for the head node for a given extent. Returns the |
961 | * It must be called with the delayed ref spinlock held, and it returns | 1135 | * head node if found, or NULL if not. |
962 | * the head node if any where found, or NULL if not. | ||
963 | */ | 1136 | */ |
964 | struct btrfs_delayed_ref_head * | 1137 | struct btrfs_delayed_ref_head * |
965 | btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) | 1138 | btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) |
966 | { | 1139 | { |
1140 | lockdep_assert_held(&delayed_refs->lock); | ||
1141 | |||
967 | return find_ref_head(delayed_refs, bytenr, false); | 1142 | return find_ref_head(delayed_refs, bytenr, false); |
968 | } | 1143 | } |
969 | 1144 | ||
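The transfer logic in btrfs_migrate_to_delayed_refs_rsv() above only tops the delayed refs reserve up to its current deficit and hands anything extra back to the space info. A standalone sketch of that clamping arithmetic, using plain structs with no locking, purely for illustration:

#include <stdio.h>
#include <stdint.h>

struct rsv {
	uint64_t size;		/* how much the reserve wants */
	uint64_t reserved;	/* how much it currently holds */
};

static uint64_t migrate(struct rsv *src, struct rsv *delayed, uint64_t bytes)
{
	uint64_t to_free = 0;

	/* Take the bytes out of the source reserve. */
	src->reserved -= bytes;
	src->size -= bytes;

	if (delayed->size > delayed->reserved) {
		uint64_t deficit = delayed->size - delayed->reserved;

		/* Only keep enough to cover the deficit. */
		if (bytes > deficit) {
			to_free = bytes - deficit;
			bytes = deficit;
		}
	} else {
		/* Already full: everything goes back. */
		to_free = bytes;
		bytes = 0;
	}

	delayed->reserved += bytes;
	return to_free;		/* caller returns this to the space info */
}

int main(void)
{
	struct rsv src = { .size = 1 << 20, .reserved = 1 << 20 };
	struct rsv delayed = { .size = 3 << 16, .reserved = 1 << 16 };

	/* Deficit is 128 KiB; moving 256 KiB frees the other 128 KiB. */
	printf("returned to pool: %llu\n",
	       (unsigned long long)migrate(&src, &delayed, 256 * 1024));
	return 0;
}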
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c18f93ea88ed..1c977e6d45dc 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h | |||
@@ -364,6 +364,16 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( | |||
364 | 364 | ||
365 | int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); | 365 | int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); |
366 | 366 | ||
367 | void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); | ||
368 | void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); | ||
369 | int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, | ||
370 | enum btrfs_reserve_flush_enum flush); | ||
371 | void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, | ||
372 | struct btrfs_block_rsv *src, | ||
373 | u64 num_bytes); | ||
374 | int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); | ||
375 | bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); | ||
376 | |||
367 | /* | 377 | /* |
368 | * helper functions to cast a node into its container | 378 | * helper functions to cast a node into its container |
369 | */ | 379 | */ |
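btrfs_should_throttle_delayed_refs(), declared above and added in delayed-ref.c, picks a throttle level from an estimate of how long the queued delayed refs would take to run, falling back to reservation pressure when the backlog is small. A self-contained model of that decision; the average runtime figure here is made up (the real value is tracked in fs_info->avg_delayed_ref_runtime).

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC	1000000000ULL

static int should_throttle(uint64_t num_entries, uint64_t avg_ref_runtime_ns,
			   int rsv_needs_refill)
{
	uint64_t est_ns = num_entries * avg_ref_runtime_ns;

	if (est_ns >= NSEC_PER_SEC)
		return 1;		/* heavy backlog: throttle hard */
	if (est_ns >= NSEC_PER_SEC / 2)
		return 2;		/* moderate backlog */
	/* Otherwise the decision rests on reservation pressure. */
	return rsv_needs_refill;
}

int main(void)
{
	printf("%d\n", should_throttle(50000, 30000, 0));	/* ~1.5s of work -> 1 */
	printf("%d\n", should_throttle(20000, 30000, 0));	/* ~0.6s of work -> 2 */
	printf("%d\n", should_throttle(1000, 30000, 1));	/* light, but rsv low -> 1 */
	return 0;
}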
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index ee0989c7e3a9..6b2e9aa83ffa 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c | |||
@@ -201,7 +201,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, | |||
201 | return PTR_ERR(bdev); | 201 | return PTR_ERR(bdev); |
202 | } | 202 | } |
203 | 203 | ||
204 | filemap_write_and_wait(bdev->bd_inode->i_mapping); | 204 | sync_blockdev(bdev); |
205 | 205 | ||
206 | devices = &fs_info->fs_devices->devices; | 206 | devices = &fs_info->fs_devices->devices; |
207 | list_for_each_entry(device, devices, dev_list) { | 207 | list_for_each_entry(device, devices, dev_list) { |
@@ -237,7 +237,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, | |||
237 | } | 237 | } |
238 | rcu_assign_pointer(device->name, name); | 238 | rcu_assign_pointer(device->name, name); |
239 | 239 | ||
240 | mutex_lock(&fs_info->fs_devices->device_list_mutex); | ||
241 | set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); | 240 | set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); |
242 | device->generation = 0; | 241 | device->generation = 0; |
243 | device->io_width = fs_info->sectorsize; | 242 | device->io_width = fs_info->sectorsize; |
@@ -256,6 +255,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, | |||
256 | device->dev_stats_valid = 1; | 255 | device->dev_stats_valid = 1; |
257 | set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); | 256 | set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); |
258 | device->fs_devices = fs_info->fs_devices; | 257 | device->fs_devices = fs_info->fs_devices; |
258 | |||
259 | mutex_lock(&fs_info->fs_devices->device_list_mutex); | ||
259 | list_add(&device->dev_list, &fs_info->fs_devices->devices); | 260 | list_add(&device->dev_list, &fs_info->fs_devices->devices); |
260 | fs_info->fs_devices->num_devices++; | 261 | fs_info->fs_devices->num_devices++; |
261 | fs_info->fs_devices->open_devices++; | 262 | fs_info->fs_devices->open_devices++; |
@@ -399,7 +400,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, | |||
399 | int ret; | 400 | int ret; |
400 | struct btrfs_device *tgt_device = NULL; | 401 | struct btrfs_device *tgt_device = NULL; |
401 | struct btrfs_device *src_device = NULL; | 402 | struct btrfs_device *src_device = NULL; |
402 | bool need_unlock; | ||
403 | 403 | ||
404 | src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, | 404 | src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, |
405 | srcdev_name); | 405 | srcdev_name); |
@@ -413,11 +413,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, | |||
413 | return -ETXTBSY; | 413 | return -ETXTBSY; |
414 | } | 414 | } |
415 | 415 | ||
416 | ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, | ||
417 | src_device, &tgt_device); | ||
418 | if (ret) | ||
419 | return ret; | ||
420 | |||
421 | /* | 416 | /* |
422 | * Here we commit the transaction to make sure commit_total_bytes | 417 | * Here we commit the transaction to make sure commit_total_bytes |
423 | * of all the devices are updated. | 418 | * of all the devices are updated. |
@@ -431,7 +426,11 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, | |||
431 | return PTR_ERR(trans); | 426 | return PTR_ERR(trans); |
432 | } | 427 | } |
433 | 428 | ||
434 | need_unlock = true; | 429 | ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, |
430 | src_device, &tgt_device); | ||
431 | if (ret) | ||
432 | return ret; | ||
433 | |||
435 | down_write(&dev_replace->rwsem); | 434 | down_write(&dev_replace->rwsem); |
436 | switch (dev_replace->replace_state) { | 435 | switch (dev_replace->replace_state) { |
437 | case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: | 436 | case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: |
@@ -442,11 +441,11 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, | |||
442 | case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: | 441 | case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: |
443 | ASSERT(0); | 442 | ASSERT(0); |
444 | ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; | 443 | ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; |
444 | up_write(&dev_replace->rwsem); | ||
445 | goto leave; | 445 | goto leave; |
446 | } | 446 | } |
447 | 447 | ||
448 | dev_replace->cont_reading_from_srcdev_mode = read_src; | 448 | dev_replace->cont_reading_from_srcdev_mode = read_src; |
449 | WARN_ON(!src_device); | ||
450 | dev_replace->srcdev = src_device; | 449 | dev_replace->srcdev = src_device; |
451 | dev_replace->tgtdev = tgt_device; | 450 | dev_replace->tgtdev = tgt_device; |
452 | 451 | ||
@@ -471,7 +470,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, | |||
471 | atomic64_set(&dev_replace->num_write_errors, 0); | 470 | atomic64_set(&dev_replace->num_write_errors, 0); |
472 | atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); | 471 | atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); |
473 | up_write(&dev_replace->rwsem); | 472 | up_write(&dev_replace->rwsem); |
474 | need_unlock = false; | ||
475 | 473 | ||
476 | ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); | 474 | ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); |
477 | if (ret) | 475 | if (ret) |
@@ -479,16 +477,16 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, | |||
479 | 477 | ||
480 | btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); | 478 | btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); |
481 | 479 | ||
482 | /* force writing the updated state information to disk */ | 480 | /* Commit dev_replace state and reserve 1 item for it. */ |
483 | trans = btrfs_start_transaction(root, 0); | 481 | trans = btrfs_start_transaction(root, 1); |
484 | if (IS_ERR(trans)) { | 482 | if (IS_ERR(trans)) { |
485 | ret = PTR_ERR(trans); | 483 | ret = PTR_ERR(trans); |
486 | need_unlock = true; | ||
487 | down_write(&dev_replace->rwsem); | 484 | down_write(&dev_replace->rwsem); |
488 | dev_replace->replace_state = | 485 | dev_replace->replace_state = |
489 | BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; | 486 | BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; |
490 | dev_replace->srcdev = NULL; | 487 | dev_replace->srcdev = NULL; |
491 | dev_replace->tgtdev = NULL; | 488 | dev_replace->tgtdev = NULL; |
489 | up_write(&dev_replace->rwsem); | ||
492 | goto leave; | 490 | goto leave; |
493 | } | 491 | } |
494 | 492 | ||
@@ -510,8 +508,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, | |||
510 | return ret; | 508 | return ret; |
511 | 509 | ||
512 | leave: | 510 | leave: |
513 | if (need_unlock) | ||
514 | up_write(&dev_replace->rwsem); | ||
515 | btrfs_destroy_dev_replace_tgtdev(tgt_device); | 511 | btrfs_destroy_dev_replace_tgtdev(tgt_device); |
516 | return ret; | 512 | return ret; |
517 | } | 513 | } |
@@ -678,7 +674,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, | |||
678 | btrfs_device_set_disk_total_bytes(tgt_device, | 674 | btrfs_device_set_disk_total_bytes(tgt_device, |
679 | src_device->disk_total_bytes); | 675 | src_device->disk_total_bytes); |
680 | btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); | 676 | btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); |
681 | tgt_device->commit_total_bytes = src_device->commit_total_bytes; | ||
682 | tgt_device->commit_bytes_used = src_device->bytes_used; | 677 | tgt_device->commit_bytes_used = src_device->bytes_used; |
683 | 678 | ||
684 | btrfs_assign_next_active_device(src_device, tgt_device); | 679 | btrfs_assign_next_active_device(src_device, tgt_device); |
@@ -728,7 +723,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( | |||
728 | struct btrfs_device *srcdev, | 723 | struct btrfs_device *srcdev, |
729 | struct btrfs_device *tgtdev) | 724 | struct btrfs_device *tgtdev) |
730 | { | 725 | { |
731 | struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; | 726 | struct extent_map_tree *em_tree = &fs_info->mapping_tree; |
732 | struct extent_map *em; | 727 | struct extent_map *em; |
733 | struct map_lookup *map; | 728 | struct map_lookup *map; |
734 | u64 start = 0; | 729 | u64 start = 0; |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index deb74a8c191a..41a2bd2e0c56 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/crc32c.h> | 19 | #include <linux/crc32c.h> |
20 | #include <linux/sched/mm.h> | 20 | #include <linux/sched/mm.h> |
21 | #include <asm/unaligned.h> | 21 | #include <asm/unaligned.h> |
22 | #include <crypto/hash.h> | ||
22 | #include "ctree.h" | 23 | #include "ctree.h" |
23 | #include "disk-io.h" | 24 | #include "disk-io.h" |
24 | #include "transaction.h" | 25 | #include "transaction.h" |
@@ -40,10 +41,6 @@ | |||
40 | #include "tree-checker.h" | 41 | #include "tree-checker.h" |
41 | #include "ref-verify.h" | 42 | #include "ref-verify.h" |
42 | 43 | ||
43 | #ifdef CONFIG_X86 | ||
44 | #include <asm/cpufeature.h> | ||
45 | #endif | ||
46 | |||
47 | #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ | 44 | #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ |
48 | BTRFS_HEADER_FLAG_RELOC |\ | 45 | BTRFS_HEADER_FLAG_RELOC |\ |
49 | BTRFS_SUPER_FLAG_ERROR |\ | 46 | BTRFS_SUPER_FLAG_ERROR |\ |
@@ -249,16 +246,6 @@ out: | |||
249 | return em; | 246 | return em; |
250 | } | 247 | } |
251 | 248 | ||
252 | u32 btrfs_csum_data(const char *data, u32 seed, size_t len) | ||
253 | { | ||
254 | return crc32c(seed, data, len); | ||
255 | } | ||
256 | |||
257 | void btrfs_csum_final(u32 crc, u8 *result) | ||
258 | { | ||
259 | put_unaligned_le32(~crc, result); | ||
260 | } | ||
261 | |||
262 | /* | 249 | /* |
263 | * Compute the csum of a btree block and store the result to provided buffer. | 250 | * Compute the csum of a btree block and store the result to provided buffer. |
264 | * | 251 | * |
@@ -266,6 +253,8 @@ void btrfs_csum_final(u32 crc, u8 *result) | |||
266 | */ | 253 | */ |
267 | static int csum_tree_block(struct extent_buffer *buf, u8 *result) | 254 | static int csum_tree_block(struct extent_buffer *buf, u8 *result) |
268 | { | 255 | { |
256 | struct btrfs_fs_info *fs_info = buf->fs_info; | ||
257 | SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); | ||
269 | unsigned long len; | 258 | unsigned long len; |
270 | unsigned long cur_len; | 259 | unsigned long cur_len; |
271 | unsigned long offset = BTRFS_CSUM_SIZE; | 260 | unsigned long offset = BTRFS_CSUM_SIZE; |
@@ -273,9 +262,12 @@ static int csum_tree_block(struct extent_buffer *buf, u8 *result) | |||
273 | unsigned long map_start; | 262 | unsigned long map_start; |
274 | unsigned long map_len; | 263 | unsigned long map_len; |
275 | int err; | 264 | int err; |
276 | u32 crc = ~(u32)0; | 265 | |
266 | shash->tfm = fs_info->csum_shash; | ||
267 | crypto_shash_init(shash); | ||
277 | 268 | ||
278 | len = buf->len - offset; | 269 | len = buf->len - offset; |
270 | |||
279 | while (len > 0) { | 271 | while (len > 0) { |
280 | /* | 272 | /* |
281 | * Note: we don't need to check for the err == 1 case here, as | 273 | * Note: we don't need to check for the err == 1 case here, as |
@@ -288,14 +280,13 @@ static int csum_tree_block(struct extent_buffer *buf, u8 *result) | |||
288 | if (WARN_ON(err)) | 280 | if (WARN_ON(err)) |
289 | return err; | 281 | return err; |
290 | cur_len = min(len, map_len - (offset - map_start)); | 282 | cur_len = min(len, map_len - (offset - map_start)); |
291 | crc = btrfs_csum_data(kaddr + offset - map_start, | 283 | crypto_shash_update(shash, kaddr + offset - map_start, cur_len); |
292 | crc, cur_len); | ||
293 | len -= cur_len; | 284 | len -= cur_len; |
294 | offset += cur_len; | 285 | offset += cur_len; |
295 | } | 286 | } |
296 | memset(result, 0, BTRFS_CSUM_SIZE); | 287 | memset(result, 0, BTRFS_CSUM_SIZE); |
297 | 288 | ||
298 | btrfs_csum_final(crc, result); | 289 | crypto_shash_final(shash, result); |
299 | 290 | ||
300 | return 0; | 291 | return 0; |
301 | } | 292 | } |
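csum_tree_block() above feeds the extent buffer to the hash one mapped chunk at a time via crypto_shash_update(). That works because CRC32C, like any streaming hash, produces the same result whether the data arrives in one pass or in pieces. A small userspace demonstration using a bitwise CRC32C rather than the kernel crypto API:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Bitwise CRC32C (Castagnoli), reflected polynomial 0x82F63B78. */
static uint32_t crc32c_update(uint32_t crc, const uint8_t *data, size_t len)
{
	while (len--) {
		crc ^= *data++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78U & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	const uint8_t buf[] = "btrfs extent buffer payload";
	uint32_t one_shot, chunked;

	/* One pass over the whole buffer. */
	one_shot = ~crc32c_update(~0U, buf, sizeof(buf));

	/* Same data fed in two pieces, as csum_tree_block does per mapping. */
	chunked = crc32c_update(~0U, buf, 10);
	chunked = ~crc32c_update(chunked, buf + 10, sizeof(buf) - 10);

	assert(one_shot == chunked);
	printf("crc32c = 0x%08x\n", one_shot);
	return 0;
}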
@@ -356,6 +347,16 @@ out: | |||
356 | return ret; | 347 | return ret; |
357 | } | 348 | } |
358 | 349 | ||
350 | static bool btrfs_supported_super_csum(u16 csum_type) | ||
351 | { | ||
352 | switch (csum_type) { | ||
353 | case BTRFS_CSUM_TYPE_CRC32: | ||
354 | return true; | ||
355 | default: | ||
356 | return false; | ||
357 | } | ||
358 | } | ||
359 | |||
359 | /* | 360 | /* |
360 | * Return 0 if the superblock checksum type matches the checksum value of that | 361 | * Return 0 if the superblock checksum type matches the checksum value of that |
361 | * algorithm. Pass the raw disk superblock data. | 362 | * algorithm. Pass the raw disk superblock data. |
@@ -365,33 +366,25 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, | |||
365 | { | 366 | { |
366 | struct btrfs_super_block *disk_sb = | 367 | struct btrfs_super_block *disk_sb = |
367 | (struct btrfs_super_block *)raw_disk_sb; | 368 | (struct btrfs_super_block *)raw_disk_sb; |
368 | u16 csum_type = btrfs_super_csum_type(disk_sb); | 369 | char result[BTRFS_CSUM_SIZE]; |
369 | int ret = 0; | 370 | SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); |
370 | 371 | ||
371 | if (csum_type == BTRFS_CSUM_TYPE_CRC32) { | 372 | shash->tfm = fs_info->csum_shash; |
372 | u32 crc = ~(u32)0; | 373 | crypto_shash_init(shash); |
373 | char result[sizeof(crc)]; | ||
374 | 374 | ||
375 | /* | 375 | /* |
376 | * The super_block structure does not span the whole | 376 | * The super_block structure does not span the whole |
377 | * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space | 377 | * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is |
378 | * is filled with zeros and is included in the checksum. | 378 | * filled with zeros and is included in the checksum. |
379 | */ | 379 | */ |
380 | crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, | 380 | crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE, |
381 | crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); | 381 | BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); |
382 | btrfs_csum_final(crc, result); | 382 | crypto_shash_final(shash, result); |
383 | 383 | ||
384 | if (memcmp(raw_disk_sb, result, sizeof(result))) | 384 | if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb))) |
385 | ret = 1; | 385 | return 1; |
386 | } | ||
387 | 386 | ||
388 | if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { | 387 | return 0; |
389 | btrfs_err(fs_info, "unsupported checksum algorithm %u", | ||
390 | csum_type); | ||
391 | ret = 1; | ||
392 | } | ||
393 | |||
394 | return ret; | ||
395 | } | 388 | } |
396 | 389 | ||
397 | int btrfs_verify_level_key(struct extent_buffer *eb, int level, | 390 | int btrfs_verify_level_key(struct extent_buffer *eb, int level, |
@@ -873,14 +866,13 @@ static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio, | |||
873 | return btree_csum_one_bio(bio); | 866 | return btree_csum_one_bio(bio); |
874 | } | 867 | } |
875 | 868 | ||
876 | static int check_async_write(struct btrfs_inode *bi) | 869 | static int check_async_write(struct btrfs_fs_info *fs_info, |
870 | struct btrfs_inode *bi) | ||
877 | { | 871 | { |
878 | if (atomic_read(&bi->sync_writers)) | 872 | if (atomic_read(&bi->sync_writers)) |
879 | return 0; | 873 | return 0; |
880 | #ifdef CONFIG_X86 | 874 | if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) |
881 | if (static_cpu_has(X86_FEATURE_XMM4_2)) | ||
882 | return 0; | 875 | return 0; |
883 | #endif | ||
884 | return 1; | 876 | return 1; |
885 | } | 877 | } |
886 | 878 | ||
@@ -889,7 +881,7 @@ static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, | |||
889 | unsigned long bio_flags) | 881 | unsigned long bio_flags) |
890 | { | 882 | { |
891 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); | 883 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); |
892 | int async = check_async_write(BTRFS_I(inode)); | 884 | int async = check_async_write(fs_info, BTRFS_I(inode)); |
893 | blk_status_t ret; | 885 | blk_status_t ret; |
894 | 886 | ||
895 | if (bio_op(bio) != REQ_OP_WRITE) { | 887 | if (bio_op(bio) != REQ_OP_WRITE) { |
@@ -2262,6 +2254,29 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, | |||
2262 | return 0; | 2254 | return 0; |
2263 | } | 2255 | } |
2264 | 2256 | ||
2257 | static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) | ||
2258 | { | ||
2259 | struct crypto_shash *csum_shash; | ||
2260 | const char *csum_name = btrfs_super_csum_name(csum_type); | ||
2261 | |||
2262 | csum_shash = crypto_alloc_shash(csum_name, 0, 0); | ||
2263 | |||
2264 | if (IS_ERR(csum_shash)) { | ||
2265 | btrfs_err(fs_info, "error allocating %s hash for checksum", | ||
2266 | csum_name); | ||
2267 | return PTR_ERR(csum_shash); | ||
2268 | } | ||
2269 | |||
2270 | fs_info->csum_shash = csum_shash; | ||
2271 | |||
2272 | return 0; | ||
2273 | } | ||
2274 | |||
2275 | static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) | ||
2276 | { | ||
2277 | crypto_free_shash(fs_info->csum_shash); | ||
2278 | } | ||
2279 | |||
2265 | static int btrfs_replay_log(struct btrfs_fs_info *fs_info, | 2280 | static int btrfs_replay_log(struct btrfs_fs_info *fs_info, |
2266 | struct btrfs_fs_devices *fs_devices) | 2281 | struct btrfs_fs_devices *fs_devices) |
2267 | { | 2282 | { |
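Note: once btrfs_init_csum_hash() above has stashed the transform in fs_info->csum_shash, a caller that only needs to hash one contiguous buffer could also use the one-shot digest helper instead of init/update/final. A sketch, not code from this series:

#include <crypto/hash.h>

/*
 * Sketch only: one-shot digest over a single buffer with an already
 * allocated transform (e.g. fs_info->csum_shash).  Equivalent to the
 * init/update/final sequence used elsewhere in this diff.
 */
static int example_one_shot_digest(struct crypto_shash *tfm,
				   const u8 *data, unsigned int len, u8 *out)
{
	SHASH_DESC_ON_STACK(shash, tfm);

	shash->tfm = tfm;
	return crypto_shash_digest(shash, data, len, out);
}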
@@ -2577,7 +2592,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, | |||
2577 | ret = validate_super(fs_info, sb, -1); | 2592 | ret = validate_super(fs_info, sb, -1); |
2578 | if (ret < 0) | 2593 | if (ret < 0) |
2579 | goto out; | 2594 | goto out; |
2580 | if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) { | 2595 | if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { |
2581 | ret = -EUCLEAN; | 2596 | ret = -EUCLEAN; |
2582 | btrfs_err(fs_info, "invalid csum type, has %u want %u", | 2597 | btrfs_err(fs_info, "invalid csum type, has %u want %u", |
2583 | btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); | 2598 | btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); |
@@ -2607,6 +2622,7 @@ int open_ctree(struct super_block *sb, | |||
2607 | u32 stripesize; | 2622 | u32 stripesize; |
2608 | u64 generation; | 2623 | u64 generation; |
2609 | u64 features; | 2624 | u64 features; |
2625 | u16 csum_type; | ||
2610 | struct btrfs_key location; | 2626 | struct btrfs_key location; |
2611 | struct buffer_head *bh; | 2627 | struct buffer_head *bh; |
2612 | struct btrfs_super_block *disk_super; | 2628 | struct btrfs_super_block *disk_super; |
@@ -2689,7 +2705,7 @@ int open_ctree(struct super_block *sb, | |||
2689 | INIT_LIST_HEAD(&fs_info->space_info); | 2705 | INIT_LIST_HEAD(&fs_info->space_info); |
2690 | INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); | 2706 | INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); |
2691 | INIT_LIST_HEAD(&fs_info->unused_bgs); | 2707 | INIT_LIST_HEAD(&fs_info->unused_bgs); |
2692 | btrfs_mapping_init(&fs_info->mapping_tree); | 2708 | extent_map_tree_init(&fs_info->mapping_tree); |
2693 | btrfs_init_block_rsv(&fs_info->global_block_rsv, | 2709 | btrfs_init_block_rsv(&fs_info->global_block_rsv, |
2694 | BTRFS_BLOCK_RSV_GLOBAL); | 2710 | BTRFS_BLOCK_RSV_GLOBAL); |
2695 | btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); | 2711 | btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); |
@@ -2793,6 +2809,8 @@ int open_ctree(struct super_block *sb, | |||
2793 | spin_lock_init(&fs_info->swapfile_pins_lock); | 2809 | spin_lock_init(&fs_info->swapfile_pins_lock); |
2794 | fs_info->swapfile_pins = RB_ROOT; | 2810 | fs_info->swapfile_pins = RB_ROOT; |
2795 | 2811 | ||
2812 | fs_info->send_in_progress = 0; | ||
2813 | |||
2796 | ret = btrfs_alloc_stripe_hash_table(fs_info); | 2814 | ret = btrfs_alloc_stripe_hash_table(fs_info); |
2797 | if (ret) { | 2815 | if (ret) { |
2798 | err = ret; | 2816 | err = ret; |
@@ -2813,6 +2831,25 @@ int open_ctree(struct super_block *sb, | |||
2813 | } | 2831 | } |
2814 | 2832 | ||
2815 | /* | 2833 | /* |
2834 | * Verify the type first, if that or the checksum value is |||
2835 | * corrupted, we'll find out | ||
2836 | */ | ||
2837 | csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data); | ||
2838 | if (!btrfs_supported_super_csum(csum_type)) { | ||
2839 | btrfs_err(fs_info, "unsupported checksum algorithm: %u", | ||
2840 | csum_type); | ||
2841 | err = -EINVAL; | ||
2842 | brelse(bh); | ||
2843 | goto fail_alloc; | ||
2844 | } | ||
2845 | |||
2846 | ret = btrfs_init_csum_hash(fs_info, csum_type); | ||
2847 | if (ret) { | ||
2848 | err = ret; | ||
2849 | goto fail_alloc; | ||
2850 | } | ||
2851 | |||
2852 | /* | ||
2816 | * We want to check superblock checksum, the type is stored inside. | 2853 | * We want to check superblock checksum, the type is stored inside. |
2817 | * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). | 2854 | * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). |
2818 | */ | 2855 | */ |
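Note: btrfs_supported_super_csum(), called in the hunk above, is not part of this section. A hypothetical shape, assuming a small table of known checksum type ids:

#include <linux/kernel.h>
#include <linux/types.h>

/* Hypothetical stand-in for btrfs_supported_super_csum(); the table and
 * the CRC32 id value are assumptions for illustration. */
static bool example_supported_super_csum(u16 csum_type)
{
	static const u16 supported[] = { 0 /* e.g. BTRFS_CSUM_TYPE_CRC32 */ };
	int i;

	for (i = 0; i < ARRAY_SIZE(supported); i++)
		if (csum_type == supported[i])
			return true;
	return false;
}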
@@ -2820,7 +2857,7 @@ int open_ctree(struct super_block *sb, | |||
2820 | btrfs_err(fs_info, "superblock checksum mismatch"); | 2857 | btrfs_err(fs_info, "superblock checksum mismatch"); |
2821 | err = -EINVAL; | 2858 | err = -EINVAL; |
2822 | brelse(bh); | 2859 | brelse(bh); |
2823 | goto fail_alloc; | 2860 | goto fail_csum; |
2824 | } | 2861 | } |
2825 | 2862 | ||
2826 | /* | 2863 | /* |
@@ -2857,11 +2894,11 @@ int open_ctree(struct super_block *sb, | |||
2857 | if (ret) { | 2894 | if (ret) { |
2858 | btrfs_err(fs_info, "superblock contains fatal errors"); | 2895 | btrfs_err(fs_info, "superblock contains fatal errors"); |
2859 | err = -EINVAL; | 2896 | err = -EINVAL; |
2860 | goto fail_alloc; | 2897 | goto fail_csum; |
2861 | } | 2898 | } |
2862 | 2899 | ||
2863 | if (!btrfs_super_root(disk_super)) | 2900 | if (!btrfs_super_root(disk_super)) |
2864 | goto fail_alloc; | 2901 | goto fail_csum; |
2865 | 2902 | ||
2866 | /* check FS state, whether FS is broken. */ | 2903 | /* check FS state, whether FS is broken. */ |
2867 | if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) | 2904 | if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) |
@@ -2883,7 +2920,7 @@ int open_ctree(struct super_block *sb, | |||
2883 | ret = btrfs_parse_options(fs_info, options, sb->s_flags); | 2920 | ret = btrfs_parse_options(fs_info, options, sb->s_flags); |
2884 | if (ret) { | 2921 | if (ret) { |
2885 | err = ret; | 2922 | err = ret; |
2886 | goto fail_alloc; | 2923 | goto fail_csum; |
2887 | } | 2924 | } |
2888 | 2925 | ||
2889 | features = btrfs_super_incompat_flags(disk_super) & | 2926 | features = btrfs_super_incompat_flags(disk_super) & |
@@ -2893,7 +2930,7 @@ int open_ctree(struct super_block *sb, | |||
2893 | "cannot mount because of unsupported optional features (%llx)", | 2930 | "cannot mount because of unsupported optional features (%llx)", |
2894 | features); | 2931 | features); |
2895 | err = -EINVAL; | 2932 | err = -EINVAL; |
2896 | goto fail_alloc; | 2933 | goto fail_csum; |
2897 | } | 2934 | } |
2898 | 2935 | ||
2899 | features = btrfs_super_incompat_flags(disk_super); | 2936 | features = btrfs_super_incompat_flags(disk_super); |
@@ -2937,7 +2974,7 @@ int open_ctree(struct super_block *sb, | |||
2937 | btrfs_err(fs_info, | 2974 | btrfs_err(fs_info, |
2938 | "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", | 2975 | "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", |
2939 | nodesize, sectorsize); | 2976 | nodesize, sectorsize); |
2940 | goto fail_alloc; | 2977 | goto fail_csum; |
2941 | } | 2978 | } |
2942 | 2979 | ||
2943 | /* | 2980 | /* |
@@ -2953,7 +2990,7 @@ int open_ctree(struct super_block *sb, | |||
2953 | "cannot mount read-write because of unsupported optional features (%llx)", | 2990 | "cannot mount read-write because of unsupported optional features (%llx)", |
2954 | features); | 2991 | features); |
2955 | err = -EINVAL; | 2992 | err = -EINVAL; |
2956 | goto fail_alloc; | 2993 | goto fail_csum; |
2957 | } | 2994 | } |
2958 | 2995 | ||
2959 | ret = btrfs_init_workqueues(fs_info, fs_devices); | 2996 | ret = btrfs_init_workqueues(fs_info, fs_devices); |
@@ -3331,6 +3368,8 @@ fail_tree_roots: | |||
3331 | fail_sb_buffer: | 3368 | fail_sb_buffer: |
3332 | btrfs_stop_all_workers(fs_info); | 3369 | btrfs_stop_all_workers(fs_info); |
3333 | btrfs_free_block_groups(fs_info); | 3370 | btrfs_free_block_groups(fs_info); |
3371 | fail_csum: | ||
3372 | btrfs_free_csum_hash(fs_info); | ||
3334 | fail_alloc: | 3373 | fail_alloc: |
3335 | fail_iput: | 3374 | fail_iput: |
3336 | btrfs_mapping_tree_free(&fs_info->mapping_tree); | 3375 | btrfs_mapping_tree_free(&fs_info->mapping_tree); |
@@ -3472,17 +3511,20 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) | |||
3472 | static int write_dev_supers(struct btrfs_device *device, | 3511 | static int write_dev_supers(struct btrfs_device *device, |
3473 | struct btrfs_super_block *sb, int max_mirrors) | 3512 | struct btrfs_super_block *sb, int max_mirrors) |
3474 | { | 3513 | { |
3514 | struct btrfs_fs_info *fs_info = device->fs_info; | ||
3515 | SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); | ||
3475 | struct buffer_head *bh; | 3516 | struct buffer_head *bh; |
3476 | int i; | 3517 | int i; |
3477 | int ret; | 3518 | int ret; |
3478 | int errors = 0; | 3519 | int errors = 0; |
3479 | u32 crc; | ||
3480 | u64 bytenr; | 3520 | u64 bytenr; |
3481 | int op_flags; | 3521 | int op_flags; |
3482 | 3522 | ||
3483 | if (max_mirrors == 0) | 3523 | if (max_mirrors == 0) |
3484 | max_mirrors = BTRFS_SUPER_MIRROR_MAX; | 3524 | max_mirrors = BTRFS_SUPER_MIRROR_MAX; |
3485 | 3525 | ||
3526 | shash->tfm = fs_info->csum_shash; | ||
3527 | |||
3486 | for (i = 0; i < max_mirrors; i++) { | 3528 | for (i = 0; i < max_mirrors; i++) { |
3487 | bytenr = btrfs_sb_offset(i); | 3529 | bytenr = btrfs_sb_offset(i); |
3488 | if (bytenr + BTRFS_SUPER_INFO_SIZE >= | 3530 | if (bytenr + BTRFS_SUPER_INFO_SIZE >= |
@@ -3491,10 +3533,10 @@ static int write_dev_supers(struct btrfs_device *device, | |||
3491 | 3533 | ||
3492 | btrfs_set_super_bytenr(sb, bytenr); | 3534 | btrfs_set_super_bytenr(sb, bytenr); |
3493 | 3535 | ||
3494 | crc = ~(u32)0; | 3536 | crypto_shash_init(shash); |
3495 | crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc, | 3537 | crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE, |
3496 | BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); | 3538 | BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); |
3497 | btrfs_csum_final(crc, sb->csum); | 3539 | crypto_shash_final(shash, sb->csum); |
3498 | 3540 | ||
3499 | /* One reference for us, and we leave it for the caller */ | 3541 | /* One reference for us, and we leave it for the caller */ |
3500 | bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, | 3542 | bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, |
@@ -3709,7 +3751,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) | |||
3709 | 3751 | ||
3710 | if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || | 3752 | if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || |
3711 | (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) | 3753 | (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) |
3712 | min_tolerated = min(min_tolerated, | 3754 | min_tolerated = min_t(int, min_tolerated, |
3713 | btrfs_raid_array[BTRFS_RAID_SINGLE]. | 3755 | btrfs_raid_array[BTRFS_RAID_SINGLE]. |
3714 | tolerated_failures); | 3756 | tolerated_failures); |
3715 | 3757 | ||
@@ -3718,7 +3760,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) | |||
3718 | continue; | 3760 | continue; |
3719 | if (!(flags & btrfs_raid_array[raid_type].bg_flag)) | 3761 | if (!(flags & btrfs_raid_array[raid_type].bg_flag)) |
3720 | continue; | 3762 | continue; |
3721 | min_tolerated = min(min_tolerated, | 3763 | min_tolerated = min_t(int, min_tolerated, |
3722 | btrfs_raid_array[raid_type]. | 3764 | btrfs_raid_array[raid_type]. |
3723 | tolerated_failures); | 3765 | tolerated_failures); |
3724 | } | 3766 | } |
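Note: the two min() -> min_t() conversions above force the comparison into a plain int. The kernel's min() complains when the operand types differ, so min_t() is the usual fix when one side comes from a table field of a different width. Illustration, with the field width assumed for the example:

#include <linux/kernel.h>
#include <linux/types.h>

/*
 * min() would warn about comparing int with a narrower unsigned field;
 * min_t() casts both operands to the named type first.  The u8 width is
 * an assumption here, not the btrfs_raid_attr definition.
 */
static int example_min_tolerated(int min_tolerated, u8 tolerated_failures)
{
	return min_t(int, min_tolerated, tolerated_failures);
}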
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a0161aa1ea0b..e80f7c45a307 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h | |||
@@ -115,8 +115,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, | |||
115 | int atomic); | 115 | int atomic); |
116 | int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, | 116 | int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, |
117 | struct btrfs_key *first_key); | 117 | struct btrfs_key *first_key); |
118 | u32 btrfs_csum_data(const char *data, u32 seed, size_t len); | ||
119 | void btrfs_csum_final(u32 crc, u8 *result); | ||
120 | blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, | 118 | blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, |
121 | enum btrfs_wq_endio_type metadata); | 119 | enum btrfs_wq_endio_type metadata); |
122 | blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, | 120 | blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, |
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5faf057f6f37..d3b58e388535 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -28,46 +28,12 @@ | |||
28 | #include "sysfs.h" | 28 | #include "sysfs.h" |
29 | #include "qgroup.h" | 29 | #include "qgroup.h" |
30 | #include "ref-verify.h" | 30 | #include "ref-verify.h" |
31 | #include "space-info.h" | ||
32 | #include "block-rsv.h" | ||
33 | #include "delalloc-space.h" | ||
31 | 34 | ||
32 | #undef SCRAMBLE_DELAYED_REFS | 35 | #undef SCRAMBLE_DELAYED_REFS |
33 | 36 | ||
34 | /* | ||
35 | * control flags for do_chunk_alloc's force field | ||
36 | * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk | ||
37 | * if we really need one. | ||
38 | * | ||
39 | * CHUNK_ALLOC_LIMITED means to only try and allocate one | ||
40 | * if we have very few chunks already allocated. This is | ||
41 | * used as part of the clustering code to help make sure | ||
42 | * we have a good pool of storage to cluster in, without | ||
43 | * filling the FS with empty chunks | ||
44 | * | ||
45 | * CHUNK_ALLOC_FORCE means it must try to allocate one | ||
46 | * | ||
47 | */ | ||
48 | enum { | ||
49 | CHUNK_ALLOC_NO_FORCE = 0, | ||
50 | CHUNK_ALLOC_LIMITED = 1, | ||
51 | CHUNK_ALLOC_FORCE = 2, | ||
52 | }; | ||
53 | |||
54 | /* | ||
55 | * Declare a helper function to detect underflow of various space info members | ||
56 | */ | ||
57 | #define DECLARE_SPACE_INFO_UPDATE(name) \ | ||
58 | static inline void update_##name(struct btrfs_space_info *sinfo, \ | ||
59 | s64 bytes) \ | ||
60 | { \ | ||
61 | if (bytes < 0 && sinfo->name < -bytes) { \ | ||
62 | WARN_ON(1); \ | ||
63 | sinfo->name = 0; \ | ||
64 | return; \ | ||
65 | } \ | ||
66 | sinfo->name += bytes; \ | ||
67 | } | ||
68 | |||
69 | DECLARE_SPACE_INFO_UPDATE(bytes_may_use); | ||
70 | DECLARE_SPACE_INFO_UPDATE(bytes_pinned); | ||
71 | 37 | ||
72 | static int __btrfs_free_extent(struct btrfs_trans_handle *trans, | 38 | static int __btrfs_free_extent(struct btrfs_trans_handle *trans, |
73 | struct btrfs_delayed_ref_node *node, u64 parent, | 39 | struct btrfs_delayed_ref_node *node, u64 parent, |
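Note: the DECLARE_SPACE_INFO_UPDATE() helpers removed from extent-tree.c above clamp a space_info counter at zero instead of letting it wrap on underflow. The same guard written out for a single counter, with a hypothetical struct name:

#include <linux/bug.h>
#include <linux/types.h>

/* Hypothetical counter struct; mirrors the guard in the removed macro. */
struct example_counter {
	u64 bytes_may_use;
};

static inline void example_update_bytes_may_use(struct example_counter *c,
						s64 bytes)
{
	if (bytes < 0 && c->bytes_may_use < (u64)-bytes) {
		WARN_ON(1);		/* flag the underflow */
		c->bytes_may_use = 0;	/* clamp instead of wrapping */
		return;
	}
	c->bytes_may_use += bytes;
}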
@@ -84,21 +50,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, | |||
84 | static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, | 50 | static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, |
85 | struct btrfs_delayed_ref_node *node, | 51 | struct btrfs_delayed_ref_node *node, |
86 | struct btrfs_delayed_extent_op *extent_op); | 52 | struct btrfs_delayed_extent_op *extent_op); |
87 | static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, | ||
88 | int force); | ||
89 | static int find_next_key(struct btrfs_path *path, int level, | 53 | static int find_next_key(struct btrfs_path *path, int level, |
90 | struct btrfs_key *key); | 54 | struct btrfs_key *key); |
91 | static void dump_space_info(struct btrfs_fs_info *fs_info, | ||
92 | struct btrfs_space_info *info, u64 bytes, | ||
93 | int dump_block_groups); | ||
94 | static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, | ||
95 | u64 num_bytes); | ||
96 | static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, | ||
97 | struct btrfs_space_info *space_info, | ||
98 | u64 num_bytes); | ||
99 | static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, | ||
100 | struct btrfs_space_info *space_info, | ||
101 | u64 num_bytes); | ||
102 | 55 | ||
103 | static noinline int | 56 | static noinline int |
104 | block_group_cache_done(struct btrfs_block_group_cache *cache) | 57 | block_group_cache_done(struct btrfs_block_group_cache *cache) |
@@ -737,62 +690,39 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( | |||
737 | return block_group_cache_tree_search(info, bytenr, 1); | 690 | return block_group_cache_tree_search(info, bytenr, 1); |
738 | } | 691 | } |
739 | 692 | ||
740 | static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, | 693 | static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) |
741 | u64 flags) | ||
742 | { | 694 | { |
743 | struct list_head *head = &info->space_info; | 695 | if (ref->type == BTRFS_REF_METADATA) { |
744 | struct btrfs_space_info *found; | 696 | if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) |
745 | 697 | return BTRFS_BLOCK_GROUP_SYSTEM; | |
746 | flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; | 698 | else |
747 | 699 | return BTRFS_BLOCK_GROUP_METADATA; | |
748 | rcu_read_lock(); | ||
749 | list_for_each_entry_rcu(found, head, list) { | ||
750 | if (found->flags & flags) { | ||
751 | rcu_read_unlock(); | ||
752 | return found; | ||
753 | } | ||
754 | } | 700 | } |
755 | rcu_read_unlock(); | 701 | return BTRFS_BLOCK_GROUP_DATA; |
756 | return NULL; | ||
757 | } | 702 | } |
758 | 703 | ||
759 | static void add_pinned_bytes(struct btrfs_fs_info *fs_info, | 704 | static void add_pinned_bytes(struct btrfs_fs_info *fs_info, |
760 | struct btrfs_ref *ref, int sign) | 705 | struct btrfs_ref *ref) |
761 | { | 706 | { |
762 | struct btrfs_space_info *space_info; | 707 | struct btrfs_space_info *space_info; |
763 | s64 num_bytes; | 708 | u64 flags = generic_ref_to_space_flags(ref); |
764 | u64 flags; | ||
765 | |||
766 | ASSERT(sign == 1 || sign == -1); | ||
767 | num_bytes = sign * ref->len; | ||
768 | if (ref->type == BTRFS_REF_METADATA) { | ||
769 | if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) | ||
770 | flags = BTRFS_BLOCK_GROUP_SYSTEM; | ||
771 | else | ||
772 | flags = BTRFS_BLOCK_GROUP_METADATA; | ||
773 | } else { | ||
774 | flags = BTRFS_BLOCK_GROUP_DATA; | ||
775 | } | ||
776 | 709 | ||
777 | space_info = __find_space_info(fs_info, flags); | 710 | space_info = btrfs_find_space_info(fs_info, flags); |
778 | ASSERT(space_info); | 711 | ASSERT(space_info); |
779 | percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes, | 712 | percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len, |
780 | BTRFS_TOTAL_BYTES_PINNED_BATCH); | 713 | BTRFS_TOTAL_BYTES_PINNED_BATCH); |
781 | } | 714 | } |
782 | 715 | ||
783 | /* | 716 | static void sub_pinned_bytes(struct btrfs_fs_info *fs_info, |
784 | * after adding space to the filesystem, we need to clear the full flags | 717 | struct btrfs_ref *ref) |
785 | * on all the space infos. | ||
786 | */ | ||
787 | void btrfs_clear_space_info_full(struct btrfs_fs_info *info) | ||
788 | { | 718 | { |
789 | struct list_head *head = &info->space_info; | 719 | struct btrfs_space_info *space_info; |
790 | struct btrfs_space_info *found; | 720 | u64 flags = generic_ref_to_space_flags(ref); |
791 | 721 | ||
792 | rcu_read_lock(); | 722 | space_info = btrfs_find_space_info(fs_info, flags); |
793 | list_for_each_entry_rcu(found, head, list) | 723 | ASSERT(space_info); |
794 | found->full = 0; | 724 | percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len, |
795 | rcu_read_unlock(); | 725 | BTRFS_TOTAL_BYTES_PINNED_BATCH); |
796 | } | 726 | } |
797 | 727 | ||
798 | /* simple helper to search for an existing data extent at a given offset */ | 728 | /* simple helper to search for an existing data extent at a given offset */ |
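Note: the reworked add_pinned_bytes()/sub_pinned_bytes() above keep the accounting in percpu_counter_add_batch(), which only folds per-cpu deltas into the shared total once they exceed the batch. The pattern in isolation, with the batch size assumed for the example:

#include <linux/percpu_counter.h>
#include <linux/sizes.h>

#define EXAMPLE_BATCH	SZ_128M		/* assumed batch size for illustration */

/* Frequent small pin/unpin updates stay per-cpu until they cross the batch. */
static void example_pin(struct percpu_counter *pinned, u64 len)
{
	percpu_counter_add_batch(pinned, len, EXAMPLE_BATCH);
}

static void example_unpin(struct percpu_counter *pinned, u64 len)
{
	percpu_counter_add_batch(pinned, -(s64)len, EXAMPLE_BATCH);
}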
@@ -1121,11 +1051,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) | |||
1121 | __le64 lenum; | 1051 | __le64 lenum; |
1122 | 1052 | ||
1123 | lenum = cpu_to_le64(root_objectid); | 1053 | lenum = cpu_to_le64(root_objectid); |
1124 | high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); | 1054 | high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); |
1125 | lenum = cpu_to_le64(owner); | 1055 | lenum = cpu_to_le64(owner); |
1126 | low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); | 1056 | low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); |
1127 | lenum = cpu_to_le64(offset); | 1057 | lenum = cpu_to_le64(offset); |
1128 | low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); | 1058 | low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); |
1129 | 1059 | ||
1130 | return ((u64)high_crc << 31) ^ (u64)low_crc; | 1060 | return ((u64)high_crc << 31) ^ (u64)low_crc; |
1131 | } | 1061 | } |
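Note: the hash_extent_data_ref() hunk above only swaps crc32c() for the btrfs_crc32c() wrapper; the hash itself is unchanged. The same computation as a standalone sketch, assuming the wrapper behaves like plain crc32c():

#include <linux/crc32c.h>
#include <linux/types.h>
#include <asm/byteorder.h>

/* Two CRC32C streams folded into one 64-bit key, as in the hunk above. */
static u64 example_extent_data_ref_hash(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}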
@@ -2065,7 +1995,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, | |||
2065 | btrfs_ref_tree_mod(fs_info, generic_ref); | 1995 | btrfs_ref_tree_mod(fs_info, generic_ref); |
2066 | 1996 | ||
2067 | if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) | 1997 | if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) |
2068 | add_pinned_bytes(fs_info, generic_ref, -1); | 1998 | sub_pinned_bytes(fs_info, generic_ref); |
2069 | 1999 | ||
2070 | return ret; | 2000 | return ret; |
2071 | } | 2001 | } |
@@ -2462,7 +2392,7 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, | |||
2462 | flags = BTRFS_BLOCK_GROUP_SYSTEM; | 2392 | flags = BTRFS_BLOCK_GROUP_SYSTEM; |
2463 | else | 2393 | else |
2464 | flags = BTRFS_BLOCK_GROUP_METADATA; | 2394 | flags = BTRFS_BLOCK_GROUP_METADATA; |
2465 | space_info = __find_space_info(fs_info, flags); | 2395 | space_info = btrfs_find_space_info(fs_info, flags); |
2466 | ASSERT(space_info); | 2396 | ASSERT(space_info); |
2467 | percpu_counter_add_batch(&space_info->total_bytes_pinned, | 2397 | percpu_counter_add_batch(&space_info->total_bytes_pinned, |
2468 | -head->num_bytes, | 2398 | -head->num_bytes, |
@@ -2824,49 +2754,6 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) | |||
2824 | return num_csums; | 2754 | return num_csums; |
2825 | } | 2755 | } |
2826 | 2756 | ||
2827 | bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) | ||
2828 | { | ||
2829 | struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; | ||
2830 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
2831 | bool ret = false; | ||
2832 | u64 reserved; | ||
2833 | |||
2834 | spin_lock(&global_rsv->lock); | ||
2835 | reserved = global_rsv->reserved; | ||
2836 | spin_unlock(&global_rsv->lock); | ||
2837 | |||
2838 | /* | ||
2839 | * Since the global reserve is just kind of magic we don't really want | ||
2840 | * to rely on it to save our bacon, so if our size is more than the | ||
2841 | * delayed_refs_rsv and the global rsv then it's time to think about | ||
2842 | * bailing. | ||
2843 | */ | ||
2844 | spin_lock(&delayed_refs_rsv->lock); | ||
2845 | reserved += delayed_refs_rsv->reserved; | ||
2846 | if (delayed_refs_rsv->size >= reserved) | ||
2847 | ret = true; | ||
2848 | spin_unlock(&delayed_refs_rsv->lock); | ||
2849 | return ret; | ||
2850 | } | ||
2851 | |||
2852 | int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) | ||
2853 | { | ||
2854 | u64 num_entries = | ||
2855 | atomic_read(&trans->transaction->delayed_refs.num_entries); | ||
2856 | u64 avg_runtime; | ||
2857 | u64 val; | ||
2858 | |||
2859 | smp_mb(); | ||
2860 | avg_runtime = trans->fs_info->avg_delayed_ref_runtime; | ||
2861 | val = num_entries * avg_runtime; | ||
2862 | if (val >= NSEC_PER_SEC) | ||
2863 | return 1; | ||
2864 | if (val >= NSEC_PER_SEC / 2) | ||
2865 | return 2; | ||
2866 | |||
2867 | return btrfs_check_space_for_delayed_refs(trans->fs_info); | ||
2868 | } | ||
2869 | |||
2870 | /* | 2757 | /* |
2871 | * this starts processing the delayed reference count updates and | 2758 | * this starts processing the delayed reference count updates and |
2872 | * extent insertions we have queued up so far. count can be | 2759 | * extent insertions we have queued up so far. count can be |
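Note: as a worked example of the throttling heuristic removed above, assuming an average delayed-ref runtime of 100,000 ns: 10,000 queued entries estimate 1 s of work, so the check returns 1 (throttle hard); 5,000 entries estimate 0.5 s and return 2; anything smaller falls through to the delayed-refs reservation check.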
@@ -3834,93 +3721,6 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) | |||
3834 | wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); | 3721 | wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); |
3835 | } | 3722 | } |
3836 | 3723 | ||
3837 | static const char *alloc_name(u64 flags) | ||
3838 | { | ||
3839 | switch (flags) { | ||
3840 | case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: | ||
3841 | return "mixed"; | ||
3842 | case BTRFS_BLOCK_GROUP_METADATA: | ||
3843 | return "metadata"; | ||
3844 | case BTRFS_BLOCK_GROUP_DATA: | ||
3845 | return "data"; | ||
3846 | case BTRFS_BLOCK_GROUP_SYSTEM: | ||
3847 | return "system"; | ||
3848 | default: | ||
3849 | WARN_ON(1); | ||
3850 | return "invalid-combination"; | ||
3851 | }; | ||
3852 | } | ||
3853 | |||
3854 | static int create_space_info(struct btrfs_fs_info *info, u64 flags) | ||
3855 | { | ||
3856 | |||
3857 | struct btrfs_space_info *space_info; | ||
3858 | int i; | ||
3859 | int ret; | ||
3860 | |||
3861 | space_info = kzalloc(sizeof(*space_info), GFP_NOFS); | ||
3862 | if (!space_info) | ||
3863 | return -ENOMEM; | ||
3864 | |||
3865 | ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, | ||
3866 | GFP_KERNEL); | ||
3867 | if (ret) { | ||
3868 | kfree(space_info); | ||
3869 | return ret; | ||
3870 | } | ||
3871 | |||
3872 | for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) | ||
3873 | INIT_LIST_HEAD(&space_info->block_groups[i]); | ||
3874 | init_rwsem(&space_info->groups_sem); | ||
3875 | spin_lock_init(&space_info->lock); | ||
3876 | space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; | ||
3877 | space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; | ||
3878 | init_waitqueue_head(&space_info->wait); | ||
3879 | INIT_LIST_HEAD(&space_info->ro_bgs); | ||
3880 | INIT_LIST_HEAD(&space_info->tickets); | ||
3881 | INIT_LIST_HEAD(&space_info->priority_tickets); | ||
3882 | |||
3883 | ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, | ||
3884 | info->space_info_kobj, "%s", | ||
3885 | alloc_name(space_info->flags)); | ||
3886 | if (ret) { | ||
3887 | kobject_put(&space_info->kobj); | ||
3888 | return ret; | ||
3889 | } | ||
3890 | |||
3891 | list_add_rcu(&space_info->list, &info->space_info); | ||
3892 | if (flags & BTRFS_BLOCK_GROUP_DATA) | ||
3893 | info->data_sinfo = space_info; | ||
3894 | |||
3895 | return ret; | ||
3896 | } | ||
3897 | |||
3898 | static void update_space_info(struct btrfs_fs_info *info, u64 flags, | ||
3899 | u64 total_bytes, u64 bytes_used, | ||
3900 | u64 bytes_readonly, | ||
3901 | struct btrfs_space_info **space_info) | ||
3902 | { | ||
3903 | struct btrfs_space_info *found; | ||
3904 | int factor; | ||
3905 | |||
3906 | factor = btrfs_bg_type_to_factor(flags); | ||
3907 | |||
3908 | found = __find_space_info(info, flags); | ||
3909 | ASSERT(found); | ||
3910 | spin_lock(&found->lock); | ||
3911 | found->total_bytes += total_bytes; | ||
3912 | found->disk_total += total_bytes * factor; | ||
3913 | found->bytes_used += bytes_used; | ||
3914 | found->disk_used += bytes_used * factor; | ||
3915 | found->bytes_readonly += bytes_readonly; | ||
3916 | if (total_bytes > 0) | ||
3917 | found->full = 0; | ||
3918 | space_info_add_new_bytes(info, found, total_bytes - | ||
3919 | bytes_used - bytes_readonly); | ||
3920 | spin_unlock(&found->lock); | ||
3921 | *space_info = found; | ||
3922 | } | ||
3923 | |||
3924 | static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) | 3724 | static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) |
3925 | { | 3725 | { |
3926 | u64 extra_flags = chunk_to_extended(flags) & | 3726 | u64 extra_flags = chunk_to_extended(flags) & |
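Note: the removed create_space_info() also shows the kobject error convention that is easy to get wrong: after kobject_init_and_add() fails, the object must be dropped with kobject_put() so the ktype's release function runs, not freed directly. Minimal sketch with a hypothetical ktype, parent, and release behaviour:

#include <linux/kobject.h>
#include <linux/slab.h>

/* Hypothetical embedded-kobject object; its ktype->release must kfree() it. */
struct example_obj {
	struct kobject kobj;
};

static int example_register(struct kobj_type *ktype, struct kobject *parent)
{
	struct example_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	int ret;

	if (!obj)
		return -ENOMEM;

	ret = kobject_init_and_add(&obj->kobj, ktype, parent, "%s", "example");
	if (ret)
		kobject_put(&obj->kobj);	/* frees via ktype->release */
	return ret;
}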
@@ -4068,215 +3868,6 @@ u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) | |||
4068 | return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); | 3868 | return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); |
4069 | } | 3869 | } |
4070 | 3870 | ||
4071 | static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, | ||
4072 | bool may_use_included) | ||
4073 | { | ||
4074 | ASSERT(s_info); | ||
4075 | return s_info->bytes_used + s_info->bytes_reserved + | ||
4076 | s_info->bytes_pinned + s_info->bytes_readonly + | ||
4077 | (may_use_included ? s_info->bytes_may_use : 0); | ||
4078 | } | ||
4079 | |||
4080 | int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) | ||
4081 | { | ||
4082 | struct btrfs_root *root = inode->root; | ||
4083 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
4084 | struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; | ||
4085 | u64 used; | ||
4086 | int ret = 0; | ||
4087 | int need_commit = 2; | ||
4088 | int have_pinned_space; | ||
4089 | |||
4090 | /* make sure bytes are sectorsize aligned */ | ||
4091 | bytes = ALIGN(bytes, fs_info->sectorsize); | ||
4092 | |||
4093 | if (btrfs_is_free_space_inode(inode)) { | ||
4094 | need_commit = 0; | ||
4095 | ASSERT(current->journal_info); | ||
4096 | } | ||
4097 | |||
4098 | again: | ||
4099 | /* make sure we have enough space to handle the data first */ | ||
4100 | spin_lock(&data_sinfo->lock); | ||
4101 | used = btrfs_space_info_used(data_sinfo, true); | ||
4102 | |||
4103 | if (used + bytes > data_sinfo->total_bytes) { | ||
4104 | struct btrfs_trans_handle *trans; | ||
4105 | |||
4106 | /* | ||
4107 | * if we don't have enough free bytes in this space then we need | ||
4108 | * to alloc a new chunk. | ||
4109 | */ | ||
4110 | if (!data_sinfo->full) { | ||
4111 | u64 alloc_target; | ||
4112 | |||
4113 | data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; | ||
4114 | spin_unlock(&data_sinfo->lock); | ||
4115 | |||
4116 | alloc_target = btrfs_data_alloc_profile(fs_info); | ||
4117 | /* | ||
4118 | * It is ugly that we don't call nolock join | ||
4119 | * transaction for the free space inode case here. | ||
4120 | * But it is safe because we only do the data space | ||
4121 | * reservation for the free space cache in the | ||
4122 | * transaction context, the common join transaction | ||
4123 | * just increase the counter of the current transaction | ||
4124 | * handler, doesn't try to acquire the trans_lock of | ||
4125 | * the fs. | ||
4126 | */ | ||
4127 | trans = btrfs_join_transaction(root); | ||
4128 | if (IS_ERR(trans)) | ||
4129 | return PTR_ERR(trans); | ||
4130 | |||
4131 | ret = do_chunk_alloc(trans, alloc_target, | ||
4132 | CHUNK_ALLOC_NO_FORCE); | ||
4133 | btrfs_end_transaction(trans); | ||
4134 | if (ret < 0) { | ||
4135 | if (ret != -ENOSPC) | ||
4136 | return ret; | ||
4137 | else { | ||
4138 | have_pinned_space = 1; | ||
4139 | goto commit_trans; | ||
4140 | } | ||
4141 | } | ||
4142 | |||
4143 | goto again; | ||
4144 | } | ||
4145 | |||
4146 | /* | ||
4147 | * If we don't have enough pinned space to deal with this | ||
4148 | * allocation, and no removed chunk in current transaction, | ||
4149 | * don't bother committing the transaction. | ||
4150 | */ | ||
4151 | have_pinned_space = __percpu_counter_compare( | ||
4152 | &data_sinfo->total_bytes_pinned, | ||
4153 | used + bytes - data_sinfo->total_bytes, | ||
4154 | BTRFS_TOTAL_BYTES_PINNED_BATCH); | ||
4155 | spin_unlock(&data_sinfo->lock); | ||
4156 | |||
4157 | /* commit the current transaction and try again */ | ||
4158 | commit_trans: | ||
4159 | if (need_commit) { | ||
4160 | need_commit--; | ||
4161 | |||
4162 | if (need_commit > 0) { | ||
4163 | btrfs_start_delalloc_roots(fs_info, -1); | ||
4164 | btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, | ||
4165 | (u64)-1); | ||
4166 | } | ||
4167 | |||
4168 | trans = btrfs_join_transaction(root); | ||
4169 | if (IS_ERR(trans)) | ||
4170 | return PTR_ERR(trans); | ||
4171 | if (have_pinned_space >= 0 || | ||
4172 | test_bit(BTRFS_TRANS_HAVE_FREE_BGS, | ||
4173 | &trans->transaction->flags) || | ||
4174 | need_commit > 0) { | ||
4175 | ret = btrfs_commit_transaction(trans); | ||
4176 | if (ret) | ||
4177 | return ret; | ||
4178 | /* | ||
4179 | * The cleaner kthread might still be doing iput | ||
4180 | * operations. Wait for it to finish so that | ||
4181 | * more space is released. We don't need to | ||
4182 | * explicitly run the delayed iputs here because | ||
4183 | * the commit_transaction would have woken up | ||
4184 | * the cleaner. | ||
4185 | */ | ||
4186 | ret = btrfs_wait_on_delayed_iputs(fs_info); | ||
4187 | if (ret) | ||
4188 | return ret; | ||
4189 | goto again; | ||
4190 | } else { | ||
4191 | btrfs_end_transaction(trans); | ||
4192 | } | ||
4193 | } | ||
4194 | |||
4195 | trace_btrfs_space_reservation(fs_info, | ||
4196 | "space_info:enospc", | ||
4197 | data_sinfo->flags, bytes, 1); | ||
4198 | return -ENOSPC; | ||
4199 | } | ||
4200 | update_bytes_may_use(data_sinfo, bytes); | ||
4201 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
4202 | data_sinfo->flags, bytes, 1); | ||
4203 | spin_unlock(&data_sinfo->lock); | ||
4204 | |||
4205 | return 0; | ||
4206 | } | ||
4207 | |||
4208 | int btrfs_check_data_free_space(struct inode *inode, | ||
4209 | struct extent_changeset **reserved, u64 start, u64 len) | ||
4210 | { | ||
4211 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); | ||
4212 | int ret; | ||
4213 | |||
4214 | /* align the range */ | ||
4215 | len = round_up(start + len, fs_info->sectorsize) - | ||
4216 | round_down(start, fs_info->sectorsize); | ||
4217 | start = round_down(start, fs_info->sectorsize); | ||
4218 | |||
4219 | ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); | ||
4220 | if (ret < 0) | ||
4221 | return ret; | ||
4222 | |||
4223 | /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ | ||
4224 | ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); | ||
4225 | if (ret < 0) | ||
4226 | btrfs_free_reserved_data_space_noquota(inode, start, len); | ||
4227 | else | ||
4228 | ret = 0; | ||
4229 | return ret; | ||
4230 | } | ||
4231 | |||
4232 | /* | ||
4233 | * Called if we need to clear a data reservation for this inode | ||
4234 | * Normally in a error case. | ||
4235 | * | ||
4236 | * This one will *NOT* use accurate qgroup reserved space API, just for case | ||
4237 | * which we can't sleep and is sure it won't affect qgroup reserved space. | ||
4238 | * Like clear_bit_hook(). | ||
4239 | */ | ||
4240 | void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, | ||
4241 | u64 len) | ||
4242 | { | ||
4243 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); | ||
4244 | struct btrfs_space_info *data_sinfo; | ||
4245 | |||
4246 | /* Make sure the range is aligned to sectorsize */ | ||
4247 | len = round_up(start + len, fs_info->sectorsize) - | ||
4248 | round_down(start, fs_info->sectorsize); | ||
4249 | start = round_down(start, fs_info->sectorsize); | ||
4250 | |||
4251 | data_sinfo = fs_info->data_sinfo; | ||
4252 | spin_lock(&data_sinfo->lock); | ||
4253 | update_bytes_may_use(data_sinfo, -len); | ||
4254 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
4255 | data_sinfo->flags, len, 0); | ||
4256 | spin_unlock(&data_sinfo->lock); | ||
4257 | } | ||
4258 | |||
4259 | /* | ||
4260 | * Called if we need to clear a data reservation for this inode | ||
4261 | * Normally in a error case. | ||
4262 | * | ||
4263 | * This one will handle the per-inode data rsv map for accurate reserved | ||
4264 | * space framework. | ||
4265 | */ | ||
4266 | void btrfs_free_reserved_data_space(struct inode *inode, | ||
4267 | struct extent_changeset *reserved, u64 start, u64 len) | ||
4268 | { | ||
4269 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
4270 | |||
4271 | /* Make sure the range is aligned to sectorsize */ | ||
4272 | len = round_up(start + len, root->fs_info->sectorsize) - | ||
4273 | round_down(start, root->fs_info->sectorsize); | ||
4274 | start = round_down(start, root->fs_info->sectorsize); | ||
4275 | |||
4276 | btrfs_free_reserved_data_space_noquota(inode, start, len); | ||
4277 | btrfs_qgroup_free_data(inode, reserved, start, len); | ||
4278 | } | ||
4279 | |||
4280 | static void force_metadata_allocation(struct btrfs_fs_info *info) | 3871 | static void force_metadata_allocation(struct btrfs_fs_info *info) |
4281 | { | 3872 | { |
4282 | struct list_head *head = &info->space_info; | 3873 | struct list_head *head = &info->space_info; |
@@ -4290,11 +3881,6 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) | |||
4290 | rcu_read_unlock(); | 3881 | rcu_read_unlock(); |
4291 | } | 3882 | } |
4292 | 3883 | ||
4293 | static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) | ||
4294 | { | ||
4295 | return (global->size << 1); | ||
4296 | } | ||
4297 | |||
4298 | static int should_alloc_chunk(struct btrfs_fs_info *fs_info, | 3884 | static int should_alloc_chunk(struct btrfs_fs_info *fs_info, |
4299 | struct btrfs_space_info *sinfo, int force) | 3885 | struct btrfs_space_info *sinfo, int force) |
4300 | { | 3886 | { |
@@ -4325,15 +3911,9 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) | |||
4325 | { | 3911 | { |
4326 | u64 num_dev; | 3912 | u64 num_dev; |
4327 | 3913 | ||
4328 | if (type & (BTRFS_BLOCK_GROUP_RAID10 | | 3914 | num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; |
4329 | BTRFS_BLOCK_GROUP_RAID0 | | 3915 | if (!num_dev) |
4330 | BTRFS_BLOCK_GROUP_RAID5 | | ||
4331 | BTRFS_BLOCK_GROUP_RAID6)) | ||
4332 | num_dev = fs_info->fs_devices->rw_devices; | 3916 | num_dev = fs_info->fs_devices->rw_devices; |
4333 | else if (type & BTRFS_BLOCK_GROUP_RAID1) | ||
4334 | num_dev = 2; | ||
4335 | else | ||
4336 | num_dev = 1; /* DUP or single */ | ||
4337 | 3917 | ||
4338 | return num_dev; | 3918 | return num_dev; |
4339 | } | 3919 | } |
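Note: get_profile_num_devs() above now reads devs_max from btrfs_raid_array and falls back to the writable device count when the table entry is zero. The lookup pattern in isolation, with the table layout assumed for the example:

#include <linux/types.h>

/* Hypothetical slice of a raid attribute table; devs_max of 0 means
 * "no fixed upper bound", mirroring the fallback in the hunk above. */
struct example_raid_attr {
	int devs_max;
};

static u64 example_profile_num_devs(const struct example_raid_attr *attr,
				    u64 rw_devices)
{
	u64 num_dev = attr->devs_max;

	if (!num_dev)		/* striped profiles: use all writable devices */
		num_dev = rw_devices;
	return num_dev;
}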
@@ -4358,7 +3938,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) | |||
4358 | */ | 3938 | */ |
4359 | lockdep_assert_held(&fs_info->chunk_mutex); | 3939 | lockdep_assert_held(&fs_info->chunk_mutex); |
4360 | 3940 | ||
4361 | info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); | 3941 | info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); |
4362 | spin_lock(&info->lock); | 3942 | spin_lock(&info->lock); |
4363 | left = info->total_bytes - btrfs_space_info_used(info, true); | 3943 | left = info->total_bytes - btrfs_space_info_used(info, true); |
4364 | spin_unlock(&info->lock); | 3944 | spin_unlock(&info->lock); |
@@ -4372,7 +3952,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) | |||
4372 | if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { | 3952 | if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { |
4373 | btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", | 3953 | btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", |
4374 | left, thresh, type); | 3954 | left, thresh, type); |
4375 | dump_space_info(fs_info, info, 0, 0); | 3955 | btrfs_dump_space_info(fs_info, info, 0, 0); |
4376 | } | 3956 | } |
4377 | 3957 | ||
4378 | if (left < thresh) { | 3958 | if (left < thresh) { |
@@ -4405,8 +3985,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) | |||
4405 | * - return 1 if it successfully allocates a chunk, | 3985 | * - return 1 if it successfully allocates a chunk, |
4406 | * - return errors including -ENOSPC otherwise. | 3986 | * - return errors including -ENOSPC otherwise. |
4407 | */ | 3987 | */ |
4408 | static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, | 3988 | int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, |
4409 | int force) | 3989 | enum btrfs_chunk_alloc_enum force) |
4410 | { | 3990 | { |
4411 | struct btrfs_fs_info *fs_info = trans->fs_info; | 3991 | struct btrfs_fs_info *fs_info = trans->fs_info; |
4412 | struct btrfs_space_info *space_info; | 3992 | struct btrfs_space_info *space_info; |
@@ -4418,7 +3998,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, | |||
4418 | if (trans->allocating_chunk) | 3998 | if (trans->allocating_chunk) |
4419 | return -ENOSPC; | 3999 | return -ENOSPC; |
4420 | 4000 | ||
4421 | space_info = __find_space_info(fs_info, flags); | 4001 | space_info = btrfs_find_space_info(fs_info, flags); |
4422 | ASSERT(space_info); | 4002 | ASSERT(space_info); |
4423 | 4003 | ||
4424 | do { | 4004 | do { |
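Note: do_chunk_alloc() becomes the exported btrfs_chunk_alloc() and its int force argument becomes enum btrfs_chunk_alloc_enum. The enum body is not in this section; given the CHUNK_ALLOC_* constants removed from extent-tree.c above, a plausible shape is:

/* Hypothetical mirror of btrfs_chunk_alloc_enum, based on the removed
 * CHUNK_ALLOC_* constants; names are assumptions for illustration. */
enum example_chunk_alloc_enum {
	EXAMPLE_CHUNK_ALLOC_NO_FORCE,	/* allocate only if really needed */
	EXAMPLE_CHUNK_ALLOC_LIMITED,	/* allocate if very few chunks exist */
	EXAMPLE_CHUNK_ALLOC_FORCE,	/* always try to allocate */
};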
@@ -4525,1714 +4105,6 @@ out: | |||
4525 | return ret; | 4105 | return ret; |
4526 | } | 4106 | } |
4527 | 4107 | ||
4528 | static int can_overcommit(struct btrfs_fs_info *fs_info, | ||
4529 | struct btrfs_space_info *space_info, u64 bytes, | ||
4530 | enum btrfs_reserve_flush_enum flush, | ||
4531 | bool system_chunk) | ||
4532 | { | ||
4533 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
4534 | u64 profile; | ||
4535 | u64 space_size; | ||
4536 | u64 avail; | ||
4537 | u64 used; | ||
4538 | int factor; | ||
4539 | |||
4540 | /* Don't overcommit when in mixed mode. */ | ||
4541 | if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) | ||
4542 | return 0; | ||
4543 | |||
4544 | if (system_chunk) | ||
4545 | profile = btrfs_system_alloc_profile(fs_info); | ||
4546 | else | ||
4547 | profile = btrfs_metadata_alloc_profile(fs_info); | ||
4548 | |||
4549 | used = btrfs_space_info_used(space_info, false); | ||
4550 | |||
4551 | /* | ||
4552 | * We only want to allow over committing if we have lots of actual space | ||
4553 | * free, but if we don't have enough space to handle the global reserve | ||
4554 | * space then we could end up having a real enospc problem when trying | ||
4555 | * to allocate a chunk or some other such important allocation. | ||
4556 | */ | ||
4557 | spin_lock(&global_rsv->lock); | ||
4558 | space_size = calc_global_rsv_need_space(global_rsv); | ||
4559 | spin_unlock(&global_rsv->lock); | ||
4560 | if (used + space_size >= space_info->total_bytes) | ||
4561 | return 0; | ||
4562 | |||
4563 | used += space_info->bytes_may_use; | ||
4564 | |||
4565 | avail = atomic64_read(&fs_info->free_chunk_space); | ||
4566 | |||
4567 | /* | ||
4568 | * If we have dup, raid1 or raid10 then only half of the free | ||
4569 | * space is actually usable. For raid56, the space info used | ||
4570 | * doesn't include the parity drive, so we don't have to | ||
4571 | * change the math | ||
4572 | */ | ||
4573 | factor = btrfs_bg_type_to_factor(profile); | ||
4574 | avail = div_u64(avail, factor); | ||
4575 | |||
4576 | /* | ||
4577 | * If we aren't flushing all things, let us overcommit up to | ||
4578 | * 1/2th of the space. If we can flush, don't let us overcommit | ||
4579 | * too much, let it overcommit up to 1/8 of the space. | ||
4580 | */ | ||
4581 | if (flush == BTRFS_RESERVE_FLUSH_ALL) | ||
4582 | avail >>= 3; | ||
4583 | else | ||
4584 | avail >>= 1; | ||
4585 | |||
4586 | if (used + bytes < space_info->total_bytes + avail) | ||
4587 | return 1; | ||
4588 | return 0; | ||
4589 | } | ||
4590 | |||
4591 | static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, | ||
4592 | unsigned long nr_pages, int nr_items) | ||
4593 | { | ||
4594 | struct super_block *sb = fs_info->sb; | ||
4595 | |||
4596 | if (down_read_trylock(&sb->s_umount)) { | ||
4597 | writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); | ||
4598 | up_read(&sb->s_umount); | ||
4599 | } else { | ||
4600 | /* | ||
4601 | * We needn't worry the filesystem going from r/w to r/o though | ||
4602 | * we don't acquire ->s_umount mutex, because the filesystem | ||
4603 | * should guarantee the delalloc inodes list be empty after | ||
4604 | * the filesystem is readonly(all dirty pages are written to | ||
4605 | * the disk). | ||
4606 | */ | ||
4607 | btrfs_start_delalloc_roots(fs_info, nr_items); | ||
4608 | if (!current->journal_info) | ||
4609 | btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); | ||
4610 | } | ||
4611 | } | ||
4612 | |||
4613 | static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, | ||
4614 | u64 to_reclaim) | ||
4615 | { | ||
4616 | u64 bytes; | ||
4617 | u64 nr; | ||
4618 | |||
4619 | bytes = btrfs_calc_trans_metadata_size(fs_info, 1); | ||
4620 | nr = div64_u64(to_reclaim, bytes); | ||
4621 | if (!nr) | ||
4622 | nr = 1; | ||
4623 | return nr; | ||
4624 | } | ||
4625 | |||
4626 | #define EXTENT_SIZE_PER_ITEM SZ_256K | ||
4627 | |||
4628 | /* | ||
4629 | * shrink metadata reservation for delalloc | ||
4630 | */ | ||
4631 | static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, | ||
4632 | u64 orig, bool wait_ordered) | ||
4633 | { | ||
4634 | struct btrfs_space_info *space_info; | ||
4635 | struct btrfs_trans_handle *trans; | ||
4636 | u64 delalloc_bytes; | ||
4637 | u64 dio_bytes; | ||
4638 | u64 async_pages; | ||
4639 | u64 items; | ||
4640 | long time_left; | ||
4641 | unsigned long nr_pages; | ||
4642 | int loops; | ||
4643 | |||
4644 | /* Calc the number of the pages we need flush for space reservation */ | ||
4645 | items = calc_reclaim_items_nr(fs_info, to_reclaim); | ||
4646 | to_reclaim = items * EXTENT_SIZE_PER_ITEM; | ||
4647 | |||
4648 | trans = (struct btrfs_trans_handle *)current->journal_info; | ||
4649 | space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); | ||
4650 | |||
4651 | delalloc_bytes = percpu_counter_sum_positive( | ||
4652 | &fs_info->delalloc_bytes); | ||
4653 | dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); | ||
4654 | if (delalloc_bytes == 0 && dio_bytes == 0) { | ||
4655 | if (trans) | ||
4656 | return; | ||
4657 | if (wait_ordered) | ||
4658 | btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); | ||
4659 | return; | ||
4660 | } | ||
4661 | |||
4662 | /* | ||
4663 | * If we are doing more ordered than delalloc we need to just wait on | ||
4664 | * ordered extents, otherwise we'll waste time trying to flush delalloc | ||
4665 | * that likely won't give us the space back we need. | ||
4666 | */ | ||
4667 | if (dio_bytes > delalloc_bytes) | ||
4668 | wait_ordered = true; | ||
4669 | |||
4670 | loops = 0; | ||
4671 | while ((delalloc_bytes || dio_bytes) && loops < 3) { | ||
4672 | nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; | ||
4673 | |||
4674 | /* | ||
4675 | * Triggers inode writeback for up to nr_pages. This will invoke | ||
4676 | * ->writepages callback and trigger delalloc filling | ||
4677 | * (btrfs_run_delalloc_range()). | ||
4678 | */ | ||
4679 | btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); | ||
4680 | |||
4681 | /* | ||
4682 | * We need to wait for the compressed pages to start before | ||
4683 | * we continue. | ||
4684 | */ | ||
4685 | async_pages = atomic_read(&fs_info->async_delalloc_pages); | ||
4686 | if (!async_pages) | ||
4687 | goto skip_async; | ||
4688 | |||
4689 | /* | ||
4690 | * Calculate how many compressed pages we want to be written | ||
4691 | * before we continue. I.e if there are more async pages than we | ||
4692 | * require wait_event will wait until nr_pages are written. | ||
4693 | */ | ||
4694 | if (async_pages <= nr_pages) | ||
4695 | async_pages = 0; | ||
4696 | else | ||
4697 | async_pages -= nr_pages; | ||
4698 | |||
4699 | wait_event(fs_info->async_submit_wait, | ||
4700 | atomic_read(&fs_info->async_delalloc_pages) <= | ||
4701 | (int)async_pages); | ||
4702 | skip_async: | ||
4703 | spin_lock(&space_info->lock); | ||
4704 | if (list_empty(&space_info->tickets) && | ||
4705 | list_empty(&space_info->priority_tickets)) { | ||
4706 | spin_unlock(&space_info->lock); | ||
4707 | break; | ||
4708 | } | ||
4709 | spin_unlock(&space_info->lock); | ||
4710 | |||
4711 | loops++; | ||
4712 | if (wait_ordered && !trans) { | ||
4713 | btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); | ||
4714 | } else { | ||
4715 | time_left = schedule_timeout_killable(1); | ||
4716 | if (time_left) | ||
4717 | break; | ||
4718 | } | ||
4719 | delalloc_bytes = percpu_counter_sum_positive( | ||
4720 | &fs_info->delalloc_bytes); | ||
4721 | dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); | ||
4722 | } | ||
4723 | } | ||
4724 | |||
4725 | struct reserve_ticket { | ||
4726 | u64 orig_bytes; | ||
4727 | u64 bytes; | ||
4728 | int error; | ||
4729 | struct list_head list; | ||
4730 | wait_queue_head_t wait; | ||
4731 | }; | ||
4732 | |||
4733 | /** | ||
4734 | * maybe_commit_transaction - possibly commit the transaction if its ok to | ||
4735 | * @root - the root we're allocating for | ||
4736 | * @bytes - the number of bytes we want to reserve | ||
4737 | * @force - force the commit | ||
4738 | * | ||
4739 | * This will check to make sure that committing the transaction will actually | ||
4740 | * get us somewhere and then commit the transaction if it does. Otherwise it | ||
4741 | * will return -ENOSPC. | ||
4742 | */ | ||
4743 | static int may_commit_transaction(struct btrfs_fs_info *fs_info, | ||
4744 | struct btrfs_space_info *space_info) | ||
4745 | { | ||
4746 | struct reserve_ticket *ticket = NULL; | ||
4747 | struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; | ||
4748 | struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; | ||
4749 | struct btrfs_trans_handle *trans; | ||
4750 | u64 bytes_needed; | ||
4751 | u64 reclaim_bytes = 0; | ||
4752 | |||
4753 | trans = (struct btrfs_trans_handle *)current->journal_info; | ||
4754 | if (trans) | ||
4755 | return -EAGAIN; | ||
4756 | |||
4757 | spin_lock(&space_info->lock); | ||
4758 | if (!list_empty(&space_info->priority_tickets)) | ||
4759 | ticket = list_first_entry(&space_info->priority_tickets, | ||
4760 | struct reserve_ticket, list); | ||
4761 | else if (!list_empty(&space_info->tickets)) | ||
4762 | ticket = list_first_entry(&space_info->tickets, | ||
4763 | struct reserve_ticket, list); | ||
4764 | bytes_needed = (ticket) ? ticket->bytes : 0; | ||
4765 | spin_unlock(&space_info->lock); | ||
4766 | |||
4767 | if (!bytes_needed) | ||
4768 | return 0; | ||
4769 | |||
4770 | trans = btrfs_join_transaction(fs_info->extent_root); | ||
4771 | if (IS_ERR(trans)) | ||
4772 | return PTR_ERR(trans); | ||
4773 | |||
4774 | /* | ||
4775 | * See if there is enough pinned space to make this reservation, or if | ||
4776 | * we have block groups that are going to be freed, allowing us to | ||
4777 | * possibly do a chunk allocation the next loop through. | ||
4778 | */ | ||
4779 | if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || | ||
4780 | __percpu_counter_compare(&space_info->total_bytes_pinned, | ||
4781 | bytes_needed, | ||
4782 | BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) | ||
4783 | goto commit; | ||
4784 | |||
4785 | /* | ||
4786 | * See if there is some space in the delayed insertion reservation for | ||
4787 | * this reservation. | ||
4788 | */ | ||
4789 | if (space_info != delayed_rsv->space_info) | ||
4790 | goto enospc; | ||
4791 | |||
4792 | spin_lock(&delayed_rsv->lock); | ||
4793 | reclaim_bytes += delayed_rsv->reserved; | ||
4794 | spin_unlock(&delayed_rsv->lock); | ||
4795 | |||
4796 | spin_lock(&delayed_refs_rsv->lock); | ||
4797 | reclaim_bytes += delayed_refs_rsv->reserved; | ||
4798 | spin_unlock(&delayed_refs_rsv->lock); | ||
4799 | if (reclaim_bytes >= bytes_needed) | ||
4800 | goto commit; | ||
4801 | bytes_needed -= reclaim_bytes; | ||
4802 | |||
4803 | if (__percpu_counter_compare(&space_info->total_bytes_pinned, | ||
4804 | bytes_needed, | ||
4805 | BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) | ||
4806 | goto enospc; | ||
4807 | |||
4808 | commit: | ||
4809 | return btrfs_commit_transaction(trans); | ||
4810 | enospc: | ||
4811 | btrfs_end_transaction(trans); | ||
4812 | return -ENOSPC; | ||
4813 | } | ||
4814 | |||
4815 | /* | ||
4816 | * Try to flush some data based on policy set by @state. This is only advisory | ||
4817 | * and may fail for various reasons. The caller is supposed to examine the | ||
4818 | * state of @space_info to detect the outcome. | ||
4819 | */ | ||
4820 | static void flush_space(struct btrfs_fs_info *fs_info, | ||
4821 | struct btrfs_space_info *space_info, u64 num_bytes, | ||
4822 | int state) | ||
4823 | { | ||
4824 | struct btrfs_root *root = fs_info->extent_root; | ||
4825 | struct btrfs_trans_handle *trans; | ||
4826 | int nr; | ||
4827 | int ret = 0; | ||
4828 | |||
4829 | switch (state) { | ||
4830 | case FLUSH_DELAYED_ITEMS_NR: | ||
4831 | case FLUSH_DELAYED_ITEMS: | ||
4832 | if (state == FLUSH_DELAYED_ITEMS_NR) | ||
4833 | nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; | ||
4834 | else | ||
4835 | nr = -1; | ||
4836 | |||
4837 | trans = btrfs_join_transaction(root); | ||
4838 | if (IS_ERR(trans)) { | ||
4839 | ret = PTR_ERR(trans); | ||
4840 | break; | ||
4841 | } | ||
4842 | ret = btrfs_run_delayed_items_nr(trans, nr); | ||
4843 | btrfs_end_transaction(trans); | ||
4844 | break; | ||
4845 | case FLUSH_DELALLOC: | ||
4846 | case FLUSH_DELALLOC_WAIT: | ||
4847 | shrink_delalloc(fs_info, num_bytes * 2, num_bytes, | ||
4848 | state == FLUSH_DELALLOC_WAIT); | ||
4849 | break; | ||
4850 | case FLUSH_DELAYED_REFS_NR: | ||
4851 | case FLUSH_DELAYED_REFS: | ||
4852 | trans = btrfs_join_transaction(root); | ||
4853 | if (IS_ERR(trans)) { | ||
4854 | ret = PTR_ERR(trans); | ||
4855 | break; | ||
4856 | } | ||
4857 | if (state == FLUSH_DELAYED_REFS_NR) | ||
4858 | nr = calc_reclaim_items_nr(fs_info, num_bytes); | ||
4859 | else | ||
4860 | nr = 0; | ||
4861 | btrfs_run_delayed_refs(trans, nr); | ||
4862 | btrfs_end_transaction(trans); | ||
4863 | break; | ||
4864 | case ALLOC_CHUNK: | ||
4865 | case ALLOC_CHUNK_FORCE: | ||
4866 | trans = btrfs_join_transaction(root); | ||
4867 | if (IS_ERR(trans)) { | ||
4868 | ret = PTR_ERR(trans); | ||
4869 | break; | ||
4870 | } | ||
4871 | ret = do_chunk_alloc(trans, | ||
4872 | btrfs_metadata_alloc_profile(fs_info), | ||
4873 | (state == ALLOC_CHUNK) ? | ||
4874 | CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); | ||
4875 | btrfs_end_transaction(trans); | ||
4876 | if (ret > 0 || ret == -ENOSPC) | ||
4877 | ret = 0; | ||
4878 | break; | ||
4879 | case COMMIT_TRANS: | ||
4880 | /* | ||
4881 | * If we have pending delayed iputs then we could free up a | ||
4882 | * bunch of pinned space, so make sure we run the iputs before | ||
4883 | * we do our pinned bytes check below. | ||
4884 | */ | ||
4885 | btrfs_run_delayed_iputs(fs_info); | ||
4886 | btrfs_wait_on_delayed_iputs(fs_info); | ||
4887 | |||
4888 | ret = may_commit_transaction(fs_info, space_info); | ||
4889 | break; | ||
4890 | default: | ||
4891 | ret = -ENOSPC; | ||
4892 | break; | ||
4893 | } | ||
4894 | |||
4895 | trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, | ||
4896 | ret); | ||
4897 | return; | ||
4898 | } | ||
4899 | |||
4900 | static inline u64 | ||
4901 | btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, | ||
4902 | struct btrfs_space_info *space_info, | ||
4903 | bool system_chunk) | ||
4904 | { | ||
4905 | struct reserve_ticket *ticket; | ||
4906 | u64 used; | ||
4907 | u64 expected; | ||
4908 | u64 to_reclaim = 0; | ||
4909 | |||
4910 | list_for_each_entry(ticket, &space_info->tickets, list) | ||
4911 | to_reclaim += ticket->bytes; | ||
4912 | list_for_each_entry(ticket, &space_info->priority_tickets, list) | ||
4913 | to_reclaim += ticket->bytes; | ||
4914 | if (to_reclaim) | ||
4915 | return to_reclaim; | ||
4916 | |||
4917 | to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); | ||
4918 | if (can_overcommit(fs_info, space_info, to_reclaim, | ||
4919 | BTRFS_RESERVE_FLUSH_ALL, system_chunk)) | ||
4920 | return 0; | ||
4921 | |||
4922 | used = btrfs_space_info_used(space_info, true); | ||
4923 | |||
4924 | if (can_overcommit(fs_info, space_info, SZ_1M, | ||
4925 | BTRFS_RESERVE_FLUSH_ALL, system_chunk)) | ||
4926 | expected = div_factor_fine(space_info->total_bytes, 95); | ||
4927 | else | ||
4928 | expected = div_factor_fine(space_info->total_bytes, 90); | ||
4929 | |||
4930 | if (used > expected) | ||
4931 | to_reclaim = used - expected; | ||
4932 | else | ||
4933 | to_reclaim = 0; | ||
4934 | to_reclaim = min(to_reclaim, space_info->bytes_may_use + | ||
4935 | space_info->bytes_reserved); | ||
4936 | return to_reclaim; | ||
4937 | } | ||
4938 | |||
4939 | static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, | ||
4940 | struct btrfs_space_info *space_info, | ||
4941 | u64 used, bool system_chunk) | ||
4942 | { | ||
4943 | u64 thresh = div_factor_fine(space_info->total_bytes, 98); | ||
4944 | |||
4945 | /* If we're just plain full then async reclaim just slows us down. */ | ||
4946 | if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) | ||
4947 | return 0; | ||
4948 | |||
4949 | if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, | ||
4950 | system_chunk)) | ||
4951 | return 0; | ||
4952 | |||
4953 | return (used >= thresh && !btrfs_fs_closing(fs_info) && | ||
4954 | !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); | ||
4955 | } | ||
4956 | |||
4957 | static bool wake_all_tickets(struct list_head *head) | ||
4958 | { | ||
4959 | struct reserve_ticket *ticket; | ||
4960 | |||
4961 | while (!list_empty(head)) { | ||
4962 | ticket = list_first_entry(head, struct reserve_ticket, list); | ||
4963 | list_del_init(&ticket->list); | ||
4964 | ticket->error = -ENOSPC; | ||
4965 | wake_up(&ticket->wait); | ||
4966 | if (ticket->bytes != ticket->orig_bytes) | ||
4967 | return true; | ||
4968 | } | ||
4969 | return false; | ||
4970 | } | ||
4971 | |||
4972 | /* | ||
4973 | * This is for normal flushers; we can wait all goddamned day if we want to. We | ||
4974 | * will loop and continuously try to flush as long as we are making progress. | ||
4975 | * We count progress as clearing off tickets each time we have to loop. | ||
4976 | */ | ||
4977 | static void btrfs_async_reclaim_metadata_space(struct work_struct *work) | ||
4978 | { | ||
4979 | struct btrfs_fs_info *fs_info; | ||
4980 | struct btrfs_space_info *space_info; | ||
4981 | u64 to_reclaim; | ||
4982 | int flush_state; | ||
4983 | int commit_cycles = 0; | ||
4984 | u64 last_tickets_id; | ||
4985 | |||
4986 | fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); | ||
4987 | space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); | ||
4988 | |||
4989 | spin_lock(&space_info->lock); | ||
4990 | to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, | ||
4991 | false); | ||
4992 | if (!to_reclaim) { | ||
4993 | space_info->flush = 0; | ||
4994 | spin_unlock(&space_info->lock); | ||
4995 | return; | ||
4996 | } | ||
4997 | last_tickets_id = space_info->tickets_id; | ||
4998 | spin_unlock(&space_info->lock); | ||
4999 | |||
5000 | flush_state = FLUSH_DELAYED_ITEMS_NR; | ||
5001 | do { | ||
5002 | flush_space(fs_info, space_info, to_reclaim, flush_state); | ||
5003 | spin_lock(&space_info->lock); | ||
5004 | if (list_empty(&space_info->tickets)) { | ||
5005 | space_info->flush = 0; | ||
5006 | spin_unlock(&space_info->lock); | ||
5007 | return; | ||
5008 | } | ||
5009 | to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, | ||
5010 | space_info, | ||
5011 | false); | ||
5012 | if (last_tickets_id == space_info->tickets_id) { | ||
5013 | flush_state++; | ||
5014 | } else { | ||
5015 | last_tickets_id = space_info->tickets_id; | ||
5016 | flush_state = FLUSH_DELAYED_ITEMS_NR; | ||
5017 | if (commit_cycles) | ||
5018 | commit_cycles--; | ||
5019 | } | ||
5020 | |||
5021 | /* | ||
5022 | * We don't want to force a chunk allocation until we've tried | ||
5023 | * pretty hard to reclaim space. Think of the case where we | ||
5024 | * freed up a bunch of space and so have a lot of pinned space | ||
5025 | * to reclaim. We would rather use that than possibly create an | ||
5026 | * underutilized metadata chunk. So if this is our first run | ||
5027 | * through the flushing state machine skip ALLOC_CHUNK_FORCE and | ||
5028 | * commit the transaction. If nothing has changed the next go | ||
5029 | * around then we can force a chunk allocation. | ||
5030 | */ | ||
5031 | if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) | ||
5032 | flush_state++; | ||
5033 | |||
5034 | if (flush_state > COMMIT_TRANS) { | ||
5035 | commit_cycles++; | ||
5036 | if (commit_cycles > 2) { | ||
5037 | if (wake_all_tickets(&space_info->tickets)) { | ||
5038 | flush_state = FLUSH_DELAYED_ITEMS_NR; | ||
5039 | commit_cycles--; | ||
5040 | } else { | ||
5041 | space_info->flush = 0; | ||
5042 | } | ||
5043 | } else { | ||
5044 | flush_state = FLUSH_DELAYED_ITEMS_NR; | ||
5045 | } | ||
5046 | } | ||
5047 | spin_unlock(&space_info->lock); | ||
5048 | } while (flush_state <= COMMIT_TRANS); | ||
5049 | } | ||
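
The loop above depends on the flush states forming an ordered sequence that ends at COMMIT_TRANS, with commit_cycles bounding how many full passes are made before the tickets are failed. Below is a compressed userspace sketch of that walk under the assumption that no flush attempt ever makes progress; the numeric state values are illustrative stand-ins, not the real btrfs enum.

#include <stdio.h>

enum { FIRST_STATE = 1, ALLOC_CHUNK_FORCE = 8, COMMIT_TRANS = 9 };

int main(void)
{
	int flush_state = FIRST_STATE;
	int commit_cycles = 0;

	do {
		printf("flush state %d (cycle %d)\n", flush_state, commit_cycles);
		/* Pretend no ticket was satisfied, so no progress was made. */
		flush_state++;

		/* Don't force a chunk allocation on the very first pass. */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2)
				break;	/* give up and fail the tickets */
			flush_state = FIRST_STATE;
		}
	} while (flush_state <= COMMIT_TRANS);
	return 0;
}

The first pass skips the forced chunk allocation, the second and third passes include it, and only after three fruitless passes do the waiters get -ENOSPC.
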
5050 | |||
5051 | void btrfs_init_async_reclaim_work(struct work_struct *work) | ||
5052 | { | ||
5053 | INIT_WORK(work, btrfs_async_reclaim_metadata_space); | ||
5054 | } | ||
5055 | |||
5056 | static const enum btrfs_flush_state priority_flush_states[] = { | ||
5057 | FLUSH_DELAYED_ITEMS_NR, | ||
5058 | FLUSH_DELAYED_ITEMS, | ||
5059 | ALLOC_CHUNK, | ||
5060 | }; | ||
5061 | |||
5062 | static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, | ||
5063 | struct btrfs_space_info *space_info, | ||
5064 | struct reserve_ticket *ticket) | ||
5065 | { | ||
5066 | u64 to_reclaim; | ||
5067 | int flush_state; | ||
5068 | |||
5069 | spin_lock(&space_info->lock); | ||
5070 | to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, | ||
5071 | false); | ||
5072 | if (!to_reclaim) { | ||
5073 | spin_unlock(&space_info->lock); | ||
5074 | return; | ||
5075 | } | ||
5076 | spin_unlock(&space_info->lock); | ||
5077 | |||
5078 | flush_state = 0; | ||
5079 | do { | ||
5080 | flush_space(fs_info, space_info, to_reclaim, | ||
5081 | priority_flush_states[flush_state]); | ||
5082 | flush_state++; | ||
5083 | spin_lock(&space_info->lock); | ||
5084 | if (ticket->bytes == 0) { | ||
5085 | spin_unlock(&space_info->lock); | ||
5086 | return; | ||
5087 | } | ||
5088 | spin_unlock(&space_info->lock); | ||
5089 | } while (flush_state < ARRAY_SIZE(priority_flush_states)); | ||
5090 | } | ||
5091 | |||
5092 | static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, | ||
5093 | struct btrfs_space_info *space_info, | ||
5094 | struct reserve_ticket *ticket) | ||
5095 | |||
5096 | { | ||
5097 | DEFINE_WAIT(wait); | ||
5098 | u64 reclaim_bytes = 0; | ||
5099 | int ret = 0; | ||
5100 | |||
5101 | spin_lock(&space_info->lock); | ||
5102 | while (ticket->bytes > 0 && ticket->error == 0) { | ||
5103 | ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); | ||
5104 | if (ret) { | ||
5105 | ret = -EINTR; | ||
5106 | break; | ||
5107 | } | ||
5108 | spin_unlock(&space_info->lock); | ||
5109 | |||
5110 | schedule(); | ||
5111 | |||
5112 | finish_wait(&ticket->wait, &wait); | ||
5113 | spin_lock(&space_info->lock); | ||
5114 | } | ||
5115 | if (!ret) | ||
5116 | ret = ticket->error; | ||
5117 | if (!list_empty(&ticket->list)) | ||
5118 | list_del_init(&ticket->list); | ||
5119 | if (ticket->bytes && ticket->bytes < ticket->orig_bytes) | ||
5120 | reclaim_bytes = ticket->orig_bytes - ticket->bytes; | ||
5121 | spin_unlock(&space_info->lock); | ||
5122 | |||
5123 | if (reclaim_bytes) | ||
5124 | space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); | ||
5125 | return ret; | ||
5126 | } | ||
5127 | |||
5128 | /** | ||
5129 | * __reserve_metadata_bytes - try to reserve bytes from a space_info | ||
5130 | * @fs_info - the filesystem we're allocating for | ||
5131 | * @space_info - the space info we want to allocate from | ||
5132 | * @orig_bytes - the number of bytes we want | ||
5133 | * @flush - whether or not we can flush to make our reservation | ||
5134 | * | ||
5135 | * This will reserve orig_bytes number of bytes from the given space_info. If | ||
5136 | * there is not enough space it will make an attempt to flush out space to | ||
5137 | * make room. It will do this by flushing delalloc if possible or committing | ||
5138 | * the transaction. If flush is BTRFS_RESERVE_NO_FLUSH then no attempts to | ||
5139 | * regain reservations will be made and this will fail if there is not enough | ||
5140 | * space already. | ||
5141 | */ | ||
5142 | static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, | ||
5143 | struct btrfs_space_info *space_info, | ||
5144 | u64 orig_bytes, | ||
5145 | enum btrfs_reserve_flush_enum flush, | ||
5146 | bool system_chunk) | ||
5147 | { | ||
5148 | struct reserve_ticket ticket; | ||
5149 | u64 used; | ||
5150 | u64 reclaim_bytes = 0; | ||
5151 | int ret = 0; | ||
5152 | |||
5153 | ASSERT(orig_bytes); | ||
5154 | ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); | ||
5155 | |||
5156 | spin_lock(&space_info->lock); | ||
5157 | ret = -ENOSPC; | ||
5158 | used = btrfs_space_info_used(space_info, true); | ||
5159 | |||
5160 | /* | ||
5161 | * If we have enough space then hooray, make our reservation and carry | ||
5162 | * on. If not, see if we can overcommit, and if we can, hooray, carry on. | ||
5163 | * If not, things get more complicated. | ||
5164 | */ | ||
5165 | if (used + orig_bytes <= space_info->total_bytes) { | ||
5166 | update_bytes_may_use(space_info, orig_bytes); | ||
5167 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
5168 | space_info->flags, orig_bytes, 1); | ||
5169 | ret = 0; | ||
5170 | } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, | ||
5171 | system_chunk)) { | ||
5172 | update_bytes_may_use(space_info, orig_bytes); | ||
5173 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
5174 | space_info->flags, orig_bytes, 1); | ||
5175 | ret = 0; | ||
5176 | } | ||
5177 | |||
5178 | /* | ||
5179 | * If we couldn't make a reservation then setup our reservation ticket | ||
5180 | * and kick the async worker if it's not already running. | ||
5181 | * | ||
5182 | * If we are a priority flusher then we just need to add our ticket to | ||
5183 | * the list and we will do our own flushing further down. | ||
5184 | */ | ||
5185 | if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { | ||
5186 | ticket.orig_bytes = orig_bytes; | ||
5187 | ticket.bytes = orig_bytes; | ||
5188 | ticket.error = 0; | ||
5189 | init_waitqueue_head(&ticket.wait); | ||
5190 | if (flush == BTRFS_RESERVE_FLUSH_ALL) { | ||
5191 | list_add_tail(&ticket.list, &space_info->tickets); | ||
5192 | if (!space_info->flush) { | ||
5193 | space_info->flush = 1; | ||
5194 | trace_btrfs_trigger_flush(fs_info, | ||
5195 | space_info->flags, | ||
5196 | orig_bytes, flush, | ||
5197 | "enospc"); | ||
5198 | queue_work(system_unbound_wq, | ||
5199 | &fs_info->async_reclaim_work); | ||
5200 | } | ||
5201 | } else { | ||
5202 | list_add_tail(&ticket.list, | ||
5203 | &space_info->priority_tickets); | ||
5204 | } | ||
5205 | } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { | ||
5206 | used += orig_bytes; | ||
5207 | /* | ||
5208 | * We will do the space reservation dance during log replay, | ||
5209 | * which means we won't have fs_info->fs_root set, so don't do | ||
5210 | * the async reclaim as we will panic. | ||
5211 | */ | ||
5212 | if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && | ||
5213 | need_do_async_reclaim(fs_info, space_info, | ||
5214 | used, system_chunk) && | ||
5215 | !work_busy(&fs_info->async_reclaim_work)) { | ||
5216 | trace_btrfs_trigger_flush(fs_info, space_info->flags, | ||
5217 | orig_bytes, flush, "preempt"); | ||
5218 | queue_work(system_unbound_wq, | ||
5219 | &fs_info->async_reclaim_work); | ||
5220 | } | ||
5221 | } | ||
5222 | spin_unlock(&space_info->lock); | ||
5223 | if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) | ||
5224 | return ret; | ||
5225 | |||
5226 | if (flush == BTRFS_RESERVE_FLUSH_ALL) | ||
5227 | return wait_reserve_ticket(fs_info, space_info, &ticket); | ||
5228 | |||
5229 | ret = 0; | ||
5230 | priority_reclaim_metadata_space(fs_info, space_info, &ticket); | ||
5231 | spin_lock(&space_info->lock); | ||
5232 | if (ticket.bytes) { | ||
5233 | if (ticket.bytes < orig_bytes) | ||
5234 | reclaim_bytes = orig_bytes - ticket.bytes; | ||
5235 | list_del_init(&ticket.list); | ||
5236 | ret = -ENOSPC; | ||
5237 | } | ||
5238 | spin_unlock(&space_info->lock); | ||
5239 | |||
5240 | if (reclaim_bytes) | ||
5241 | space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); | ||
5242 | ASSERT(list_empty(&ticket.list)); | ||
5243 | return ret; | ||
5244 | } | ||
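
Stripped of locking and the ticket machinery, the decision at the top of the function above is two checks: reserve directly if the bytes fit under total_bytes, otherwise try the overcommit path. A minimal standalone sketch of just that decision; the can_overcommit() here is a crude stand-in for the kernel predicate, which really looks at unallocated device space and the raid profile.

#include <stdbool.h>
#include <stdint.h>

struct space_counters {
	uint64_t total_bytes;
	uint64_t used;		/* btrfs_space_info_used(space_info, true) */
	uint64_t may_use;	/* bytes_may_use */
};

/* Stand-in for can_overcommit(); illustrative only. */
static bool can_overcommit(const struct space_counters *s, uint64_t bytes)
{
	return s->used + bytes <= s->total_bytes + s->total_bytes / 2;
}

/* Returns 0 on success, -1 when the caller must queue a reserve_ticket. */
static int try_reserve(struct space_counters *s, uint64_t bytes)
{
	if (s->used + bytes <= s->total_bytes || can_overcommit(s, bytes)) {
		s->may_use += bytes;	/* update_bytes_may_use() */
		return 0;
	}
	return -1;	/* -ENOSPC path: ticket plus async/priority flushing */
}

int main(void)
{
	struct space_counters si = { .total_bytes = 1 << 30, .used = 1 << 30 };
	return try_reserve(&si, 16 * 1024 * 1024) ? 1 : 0; /* succeeds via overcommit */
}
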
5245 | |||
5246 | /** | ||
5247 | * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space | ||
5248 | * @root - the root we're allocating for | ||
5249 | * @block_rsv - the block_rsv we're allocating for | ||
5250 | * @orig_bytes - the number of bytes we want | ||
5251 | * @flush - whether or not we can flush to make our reservation | ||
5252 | * | ||
5253 | * This will reserve orig_bytes number of bytes from the space info associated | ||
5254 | * with the block_rsv. If there is not enough space it will make an attempt to | ||
5255 | * flush out space to make room. It will do this by flushing delalloc if | ||
5256 | * possible or committing the transaction. If flush is 0 then no attempts to | ||
5257 | * regain reservations will be made and this will fail if there is not enough | ||
5258 | * space already. | ||
5259 | */ | ||
5260 | static int reserve_metadata_bytes(struct btrfs_root *root, | ||
5261 | struct btrfs_block_rsv *block_rsv, | ||
5262 | u64 orig_bytes, | ||
5263 | enum btrfs_reserve_flush_enum flush) | ||
5264 | { | ||
5265 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
5266 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
5267 | int ret; | ||
5268 | bool system_chunk = (root == fs_info->chunk_root); | ||
5269 | |||
5270 | ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, | ||
5271 | orig_bytes, flush, system_chunk); | ||
5272 | if (ret == -ENOSPC && | ||
5273 | unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { | ||
5274 | if (block_rsv != global_rsv && | ||
5275 | !block_rsv_use_bytes(global_rsv, orig_bytes)) | ||
5276 | ret = 0; | ||
5277 | } | ||
5278 | if (ret == -ENOSPC) { | ||
5279 | trace_btrfs_space_reservation(fs_info, "space_info:enospc", | ||
5280 | block_rsv->space_info->flags, | ||
5281 | orig_bytes, 1); | ||
5282 | |||
5283 | if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) | ||
5284 | dump_space_info(fs_info, block_rsv->space_info, | ||
5285 | orig_bytes, 0); | ||
5286 | } | ||
5287 | return ret; | ||
5288 | } | ||
5289 | |||
5290 | static struct btrfs_block_rsv *get_block_rsv( | ||
5291 | const struct btrfs_trans_handle *trans, | ||
5292 | const struct btrfs_root *root) | ||
5293 | { | ||
5294 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
5295 | struct btrfs_block_rsv *block_rsv = NULL; | ||
5296 | |||
5297 | if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || | ||
5298 | (root == fs_info->csum_root && trans->adding_csums) || | ||
5299 | (root == fs_info->uuid_root)) | ||
5300 | block_rsv = trans->block_rsv; | ||
5301 | |||
5302 | if (!block_rsv) | ||
5303 | block_rsv = root->block_rsv; | ||
5304 | |||
5305 | if (!block_rsv) | ||
5306 | block_rsv = &fs_info->empty_block_rsv; | ||
5307 | |||
5308 | return block_rsv; | ||
5309 | } | ||
5310 | |||
5311 | static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, | ||
5312 | u64 num_bytes) | ||
5313 | { | ||
5314 | int ret = -ENOSPC; | ||
5315 | spin_lock(&block_rsv->lock); | ||
5316 | if (block_rsv->reserved >= num_bytes) { | ||
5317 | block_rsv->reserved -= num_bytes; | ||
5318 | if (block_rsv->reserved < block_rsv->size) | ||
5319 | block_rsv->full = 0; | ||
5320 | ret = 0; | ||
5321 | } | ||
5322 | spin_unlock(&block_rsv->lock); | ||
5323 | return ret; | ||
5324 | } | ||
5325 | |||
5326 | static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, | ||
5327 | u64 num_bytes, bool update_size) | ||
5328 | { | ||
5329 | spin_lock(&block_rsv->lock); | ||
5330 | block_rsv->reserved += num_bytes; | ||
5331 | if (update_size) | ||
5332 | block_rsv->size += num_bytes; | ||
5333 | else if (block_rsv->reserved >= block_rsv->size) | ||
5334 | block_rsv->full = 1; | ||
5335 | spin_unlock(&block_rsv->lock); | ||
5336 | } | ||
5337 | |||
5338 | int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, | ||
5339 | struct btrfs_block_rsv *dest, u64 num_bytes, | ||
5340 | int min_factor) | ||
5341 | { | ||
5342 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
5343 | u64 min_bytes; | ||
5344 | |||
5345 | if (global_rsv->space_info != dest->space_info) | ||
5346 | return -ENOSPC; | ||
5347 | |||
5348 | spin_lock(&global_rsv->lock); | ||
5349 | min_bytes = div_factor(global_rsv->size, min_factor); | ||
5350 | if (global_rsv->reserved < min_bytes + num_bytes) { | ||
5351 | spin_unlock(&global_rsv->lock); | ||
5352 | return -ENOSPC; | ||
5353 | } | ||
5354 | global_rsv->reserved -= num_bytes; | ||
5355 | if (global_rsv->reserved < global_rsv->size) | ||
5356 | global_rsv->full = 0; | ||
5357 | spin_unlock(&global_rsv->lock); | ||
5358 | |||
5359 | block_rsv_add_bytes(dest, num_bytes, true); | ||
5360 | return 0; | ||
5361 | } | ||
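
The guard above only lets callers steal from the global reserve when at least min_factor tenths of it remain afterwards. A hedged standalone example, assuming div_factor(n, f) means n * f / 10 as in the kernel helper:

#include <stdint.h>
#include <stdio.h>

/* Assumed semantics of the kernel helper: factor is in tenths. */
static uint64_t div_factor(uint64_t num, int factor)
{
	return num * factor / 10;
}

int main(void)
{
	const uint64_t MiB = 1024ULL * 1024;
	uint64_t global_size = 512 * MiB;
	uint64_t global_reserved = 300 * MiB;
	uint64_t num_bytes = 64 * MiB;
	int min_factor = 5;	/* keep at least 50% of the global rsv */

	uint64_t min_bytes = div_factor(global_size, min_factor); /* 256 MiB */
	if (global_reserved < min_bytes + num_bytes)
		printf("refuse: would drop the global rsv below %llu MiB\n",
		       (unsigned long long)(min_bytes / MiB));
	else
		printf("migrate %llu MiB\n",
		       (unsigned long long)(num_bytes / MiB));
	return 0;
}

With these numbers the migration is refused, which corresponds to the -ENOSPC return in the helper above.
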
5362 | |||
5363 | /** | ||
5364 | * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. | ||
5365 | * @fs_info - the fs info for our fs. | ||
5366 | * @src - the source block rsv to transfer from. | ||
5367 | * @num_bytes - the number of bytes to transfer. | ||
5368 | * | ||
5369 | * This transfers up to the num_bytes amount from the src rsv to the | ||
5370 | * delayed_refs_rsv. Any extra bytes are returned to the space info. | ||
5371 | */ | ||
5372 | void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, | ||
5373 | struct btrfs_block_rsv *src, | ||
5374 | u64 num_bytes) | ||
5375 | { | ||
5376 | struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; | ||
5377 | u64 to_free = 0; | ||
5378 | |||
5379 | spin_lock(&src->lock); | ||
5380 | src->reserved -= num_bytes; | ||
5381 | src->size -= num_bytes; | ||
5382 | spin_unlock(&src->lock); | ||
5383 | |||
5384 | spin_lock(&delayed_refs_rsv->lock); | ||
5385 | if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { | ||
5386 | u64 delta = delayed_refs_rsv->size - | ||
5387 | delayed_refs_rsv->reserved; | ||
5388 | if (num_bytes > delta) { | ||
5389 | to_free = num_bytes - delta; | ||
5390 | num_bytes = delta; | ||
5391 | } | ||
5392 | } else { | ||
5393 | to_free = num_bytes; | ||
5394 | num_bytes = 0; | ||
5395 | } | ||
5396 | |||
5397 | if (num_bytes) | ||
5398 | delayed_refs_rsv->reserved += num_bytes; | ||
5399 | if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) | ||
5400 | delayed_refs_rsv->full = 1; | ||
5401 | spin_unlock(&delayed_refs_rsv->lock); | ||
5402 | |||
5403 | if (num_bytes) | ||
5404 | trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", | ||
5405 | 0, num_bytes, 1); | ||
5406 | if (to_free) | ||
5407 | space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info, | ||
5408 | to_free); | ||
5409 | } | ||
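
The transfer above caps what the delayed refs rsv accepts at its current deficit (size - reserved) and routes the remainder straight back to the space_info. A standalone sketch of that split with illustrative byte counts:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t rsv_size = 100, rsv_reserved = 70;	/* deficit of 30 */
	uint64_t num_bytes = 50;			/* handed over by the caller */
	uint64_t to_free = 0;

	if (rsv_size > rsv_reserved) {
		uint64_t delta = rsv_size - rsv_reserved;
		if (num_bytes > delta) {
			to_free = num_bytes - delta;	/* 20 goes back to the space_info */
			num_bytes = delta;		/* 30 goes into the rsv */
		}
	} else {
		to_free = num_bytes;
		num_bytes = 0;
	}

	rsv_reserved += num_bytes;
	printf("rsv now holds %llu, space_info gets back %llu\n",
	       (unsigned long long)rsv_reserved, (unsigned long long)to_free);
	return 0;
}
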
5410 | |||
5411 | /** | ||
5412 | * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. | ||
5413 | * @fs_info - the fs_info for our fs. | ||
5414 | * @flush - control how we can flush for this reservation. | ||
5415 | * | ||
5416 | * This will refill the delayed refs rsv up to one item's worth of space and | ||
5417 | * will return -ENOSPC if we can't make the reservation. | ||
5418 | */ | ||
5419 | int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, | ||
5420 | enum btrfs_reserve_flush_enum flush) | ||
5421 | { | ||
5422 | struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; | ||
5423 | u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); | ||
5424 | u64 num_bytes = 0; | ||
5425 | int ret = -ENOSPC; | ||
5426 | |||
5427 | spin_lock(&block_rsv->lock); | ||
5428 | if (block_rsv->reserved < block_rsv->size) { | ||
5429 | num_bytes = block_rsv->size - block_rsv->reserved; | ||
5430 | num_bytes = min(num_bytes, limit); | ||
5431 | } | ||
5432 | spin_unlock(&block_rsv->lock); | ||
5433 | |||
5434 | if (!num_bytes) | ||
5435 | return 0; | ||
5436 | |||
5437 | ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, | ||
5438 | num_bytes, flush); | ||
5439 | if (ret) | ||
5440 | return ret; | ||
5441 | block_rsv_add_bytes(block_rsv, num_bytes, 0); | ||
5442 | trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", | ||
5443 | 0, num_bytes, 1); | ||
5444 | return 0; | ||
5445 | } | ||
5446 | |||
5447 | /* | ||
5448 | * This is for space we already have accounted in space_info->bytes_may_use, so | ||
5449 | * basically when we're returning space from block_rsvs. | ||
5450 | */ | ||
5451 | static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, | ||
5452 | struct btrfs_space_info *space_info, | ||
5453 | u64 num_bytes) | ||
5454 | { | ||
5455 | struct reserve_ticket *ticket; | ||
5456 | struct list_head *head; | ||
5457 | u64 used; | ||
5458 | enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; | ||
5459 | bool check_overcommit = false; | ||
5460 | |||
5461 | spin_lock(&space_info->lock); | ||
5462 | head = &space_info->priority_tickets; | ||
5463 | |||
5464 | /* | ||
5465 | * If we are over our limit then we need to check and see if we can | ||
5466 | * overcommit, and if we can't then we just need to free up our space | ||
5467 | * and not satisfy any requests. | ||
5468 | */ | ||
5469 | used = btrfs_space_info_used(space_info, true); | ||
5470 | if (used - num_bytes >= space_info->total_bytes) | ||
5471 | check_overcommit = true; | ||
5472 | again: | ||
5473 | while (!list_empty(head) && num_bytes) { | ||
5474 | ticket = list_first_entry(head, struct reserve_ticket, | ||
5475 | list); | ||
5476 | /* | ||
5477 | * We use 0 bytes because this space is already reserved, so | ||
5478 | * adding the ticket space would be a double count. | ||
5479 | */ | ||
5480 | if (check_overcommit && | ||
5481 | !can_overcommit(fs_info, space_info, 0, flush, false)) | ||
5482 | break; | ||
5483 | if (num_bytes >= ticket->bytes) { | ||
5484 | list_del_init(&ticket->list); | ||
5485 | num_bytes -= ticket->bytes; | ||
5486 | ticket->bytes = 0; | ||
5487 | space_info->tickets_id++; | ||
5488 | wake_up(&ticket->wait); | ||
5489 | } else { | ||
5490 | ticket->bytes -= num_bytes; | ||
5491 | num_bytes = 0; | ||
5492 | } | ||
5493 | } | ||
5494 | |||
5495 | if (num_bytes && head == &space_info->priority_tickets) { | ||
5496 | head = &space_info->tickets; | ||
5497 | flush = BTRFS_RESERVE_FLUSH_ALL; | ||
5498 | goto again; | ||
5499 | } | ||
5500 | update_bytes_may_use(space_info, -num_bytes); | ||
5501 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
5502 | space_info->flags, num_bytes, 0); | ||
5503 | spin_unlock(&space_info->lock); | ||
5504 | } | ||
5505 | |||
5506 | /* | ||
5507 | * This is for newly allocated space that isn't accounted in | ||
5508 | * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent | ||
5509 | * we use this helper. | ||
5510 | */ | ||
5511 | static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, | ||
5512 | struct btrfs_space_info *space_info, | ||
5513 | u64 num_bytes) | ||
5514 | { | ||
5515 | struct reserve_ticket *ticket; | ||
5516 | struct list_head *head = &space_info->priority_tickets; | ||
5517 | |||
5518 | again: | ||
5519 | while (!list_empty(head) && num_bytes) { | ||
5520 | ticket = list_first_entry(head, struct reserve_ticket, | ||
5521 | list); | ||
5522 | if (num_bytes >= ticket->bytes) { | ||
5523 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
5524 | space_info->flags, | ||
5525 | ticket->bytes, 1); | ||
5526 | list_del_init(&ticket->list); | ||
5527 | num_bytes -= ticket->bytes; | ||
5528 | update_bytes_may_use(space_info, ticket->bytes); | ||
5529 | ticket->bytes = 0; | ||
5530 | space_info->tickets_id++; | ||
5531 | wake_up(&ticket->wait); | ||
5532 | } else { | ||
5533 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
5534 | space_info->flags, | ||
5535 | num_bytes, 1); | ||
5536 | update_bytes_may_use(space_info, num_bytes); | ||
5537 | ticket->bytes -= num_bytes; | ||
5538 | num_bytes = 0; | ||
5539 | } | ||
5540 | } | ||
5541 | |||
5542 | if (num_bytes && head == &space_info->priority_tickets) { | ||
5543 | head = &space_info->tickets; | ||
5544 | goto again; | ||
5545 | } | ||
5546 | } | ||
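
Both ticket helpers above walk their lists in FIFO order, fully satisfying and waking tickets while the incoming bytes last and leaving at most one partially filled ticket at the head. A small standalone model of that distribution (ticket sizes are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t tickets[] = { 40, 25, 100 };	/* oldest first */
	const int nr = sizeof(tickets) / sizeof(tickets[0]);
	uint64_t num_bytes = 80;		/* space that just became available */

	for (int i = 0; i < nr && num_bytes; i++) {
		if (num_bytes >= tickets[i]) {	/* satisfy and wake this waiter */
			num_bytes -= tickets[i];
			tickets[i] = 0;
		} else {			/* partial fill; the waiter keeps waiting */
			tickets[i] -= num_bytes;
			num_bytes = 0;
		}
	}

	for (int i = 0; i < nr; i++)
		printf("ticket %d still wants %llu\n", i,
		       (unsigned long long)tickets[i]);	/* 0, 0, 85 */
	return 0;
}

This partial fill is also why wake_all_tickets() treats a ticket whose bytes shrank as evidence that flushing is still making progress.
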
5547 | |||
5548 | static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, | ||
5549 | struct btrfs_block_rsv *block_rsv, | ||
5550 | struct btrfs_block_rsv *dest, u64 num_bytes, | ||
5551 | u64 *qgroup_to_release_ret) | ||
5552 | { | ||
5553 | struct btrfs_space_info *space_info = block_rsv->space_info; | ||
5554 | u64 qgroup_to_release = 0; | ||
5555 | u64 ret; | ||
5556 | |||
5557 | spin_lock(&block_rsv->lock); | ||
5558 | if (num_bytes == (u64)-1) { | ||
5559 | num_bytes = block_rsv->size; | ||
5560 | qgroup_to_release = block_rsv->qgroup_rsv_size; | ||
5561 | } | ||
5562 | block_rsv->size -= num_bytes; | ||
5563 | if (block_rsv->reserved >= block_rsv->size) { | ||
5564 | num_bytes = block_rsv->reserved - block_rsv->size; | ||
5565 | block_rsv->reserved = block_rsv->size; | ||
5566 | block_rsv->full = 1; | ||
5567 | } else { | ||
5568 | num_bytes = 0; | ||
5569 | } | ||
5570 | if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { | ||
5571 | qgroup_to_release = block_rsv->qgroup_rsv_reserved - | ||
5572 | block_rsv->qgroup_rsv_size; | ||
5573 | block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; | ||
5574 | } else { | ||
5575 | qgroup_to_release = 0; | ||
5576 | } | ||
5577 | spin_unlock(&block_rsv->lock); | ||
5578 | |||
5579 | ret = num_bytes; | ||
5580 | if (num_bytes > 0) { | ||
5581 | if (dest) { | ||
5582 | spin_lock(&dest->lock); | ||
5583 | if (!dest->full) { | ||
5584 | u64 bytes_to_add; | ||
5585 | |||
5586 | bytes_to_add = dest->size - dest->reserved; | ||
5587 | bytes_to_add = min(num_bytes, bytes_to_add); | ||
5588 | dest->reserved += bytes_to_add; | ||
5589 | if (dest->reserved >= dest->size) | ||
5590 | dest->full = 1; | ||
5591 | num_bytes -= bytes_to_add; | ||
5592 | } | ||
5593 | spin_unlock(&dest->lock); | ||
5594 | } | ||
5595 | if (num_bytes) | ||
5596 | space_info_add_old_bytes(fs_info, space_info, | ||
5597 | num_bytes); | ||
5598 | } | ||
5599 | if (qgroup_to_release_ret) | ||
5600 | *qgroup_to_release_ret = qgroup_to_release; | ||
5601 | return ret; | ||
5602 | } | ||
5603 | |||
5604 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, | ||
5605 | struct btrfs_block_rsv *dst, u64 num_bytes, | ||
5606 | bool update_size) | ||
5607 | { | ||
5608 | int ret; | ||
5609 | |||
5610 | ret = block_rsv_use_bytes(src, num_bytes); | ||
5611 | if (ret) | ||
5612 | return ret; | ||
5613 | |||
5614 | block_rsv_add_bytes(dst, num_bytes, update_size); | ||
5615 | return 0; | ||
5616 | } | ||
5617 | |||
5618 | void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) | ||
5619 | { | ||
5620 | memset(rsv, 0, sizeof(*rsv)); | ||
5621 | spin_lock_init(&rsv->lock); | ||
5622 | rsv->type = type; | ||
5623 | } | ||
5624 | |||
5625 | void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, | ||
5626 | struct btrfs_block_rsv *rsv, | ||
5627 | unsigned short type) | ||
5628 | { | ||
5629 | btrfs_init_block_rsv(rsv, type); | ||
5630 | rsv->space_info = __find_space_info(fs_info, | ||
5631 | BTRFS_BLOCK_GROUP_METADATA); | ||
5632 | } | ||
5633 | |||
5634 | struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, | ||
5635 | unsigned short type) | ||
5636 | { | ||
5637 | struct btrfs_block_rsv *block_rsv; | ||
5638 | |||
5639 | block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); | ||
5640 | if (!block_rsv) | ||
5641 | return NULL; | ||
5642 | |||
5643 | btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); | ||
5644 | return block_rsv; | ||
5645 | } | ||
5646 | |||
5647 | void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, | ||
5648 | struct btrfs_block_rsv *rsv) | ||
5649 | { | ||
5650 | if (!rsv) | ||
5651 | return; | ||
5652 | btrfs_block_rsv_release(fs_info, rsv, (u64)-1); | ||
5653 | kfree(rsv); | ||
5654 | } | ||
5655 | |||
5656 | int btrfs_block_rsv_add(struct btrfs_root *root, | ||
5657 | struct btrfs_block_rsv *block_rsv, u64 num_bytes, | ||
5658 | enum btrfs_reserve_flush_enum flush) | ||
5659 | { | ||
5660 | int ret; | ||
5661 | |||
5662 | if (num_bytes == 0) | ||
5663 | return 0; | ||
5664 | |||
5665 | ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); | ||
5666 | if (!ret) | ||
5667 | block_rsv_add_bytes(block_rsv, num_bytes, true); | ||
5668 | |||
5669 | return ret; | ||
5670 | } | ||
5671 | |||
5672 | int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) | ||
5673 | { | ||
5674 | u64 num_bytes = 0; | ||
5675 | int ret = -ENOSPC; | ||
5676 | |||
5677 | if (!block_rsv) | ||
5678 | return 0; | ||
5679 | |||
5680 | spin_lock(&block_rsv->lock); | ||
5681 | num_bytes = div_factor(block_rsv->size, min_factor); | ||
5682 | if (block_rsv->reserved >= num_bytes) | ||
5683 | ret = 0; | ||
5684 | spin_unlock(&block_rsv->lock); | ||
5685 | |||
5686 | return ret; | ||
5687 | } | ||
5688 | |||
5689 | int btrfs_block_rsv_refill(struct btrfs_root *root, | ||
5690 | struct btrfs_block_rsv *block_rsv, u64 min_reserved, | ||
5691 | enum btrfs_reserve_flush_enum flush) | ||
5692 | { | ||
5693 | u64 num_bytes = 0; | ||
5694 | int ret = -ENOSPC; | ||
5695 | |||
5696 | if (!block_rsv) | ||
5697 | return 0; | ||
5698 | |||
5699 | spin_lock(&block_rsv->lock); | ||
5700 | num_bytes = min_reserved; | ||
5701 | if (block_rsv->reserved >= num_bytes) | ||
5702 | ret = 0; | ||
5703 | else | ||
5704 | num_bytes -= block_rsv->reserved; | ||
5705 | spin_unlock(&block_rsv->lock); | ||
5706 | |||
5707 | if (!ret) | ||
5708 | return 0; | ||
5709 | |||
5710 | ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); | ||
5711 | if (!ret) { | ||
5712 | block_rsv_add_bytes(block_rsv, num_bytes, false); | ||
5713 | return 0; | ||
5714 | } | ||
5715 | |||
5716 | return ret; | ||
5717 | } | ||
5718 | |||
5719 | static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, | ||
5720 | struct btrfs_block_rsv *block_rsv, | ||
5721 | u64 num_bytes, u64 *qgroup_to_release) | ||
5722 | { | ||
5723 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
5724 | struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; | ||
5725 | struct btrfs_block_rsv *target = delayed_rsv; | ||
5726 | |||
5727 | if (target->full || target == block_rsv) | ||
5728 | target = global_rsv; | ||
5729 | |||
5730 | if (block_rsv->space_info != target->space_info) | ||
5731 | target = NULL; | ||
5732 | |||
5733 | return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, | ||
5734 | qgroup_to_release); | ||
5735 | } | ||
5736 | |||
5737 | void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, | ||
5738 | struct btrfs_block_rsv *block_rsv, | ||
5739 | u64 num_bytes) | ||
5740 | { | ||
5741 | __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); | ||
5742 | } | ||
5743 | |||
5744 | /** | ||
5745 | * btrfs_inode_rsv_release - release any excessive reservation. | ||
5746 | * @inode - the inode we need to release from. | ||
5747 | * @qgroup_free - free or convert qgroup meta. | ||
5748 | * Unlike normal operation, qgroup meta reservation needs to know if we are | ||
5749 | * freeing qgroup reservation or just converting it into per-trans. Normally | ||
5750 | * @qgroup_free is true for error handling, and false for normal release. | ||
5751 | * | ||
5752 | * This is the same as btrfs_block_rsv_release, except that it handles the | ||
5753 | * tracepoint for the reservation. | ||
5754 | */ | ||
5755 | static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) | ||
5756 | { | ||
5757 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||
5758 | struct btrfs_block_rsv *block_rsv = &inode->block_rsv; | ||
5759 | u64 released = 0; | ||
5760 | u64 qgroup_to_release = 0; | ||
5761 | |||
5762 | /* | ||
5763 | * Since we statically set the block_rsv->size we just want to say we | ||
5764 | * are releasing 0 bytes, and then we'll just get the reservation over | ||
5765 | * the size freed. | ||
5766 | */ | ||
5767 | released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, | ||
5768 | &qgroup_to_release); | ||
5769 | if (released > 0) | ||
5770 | trace_btrfs_space_reservation(fs_info, "delalloc", | ||
5771 | btrfs_ino(inode), released, 0); | ||
5772 | if (qgroup_free) | ||
5773 | btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); | ||
5774 | else | ||
5775 | btrfs_qgroup_convert_reserved_meta(inode->root, | ||
5776 | qgroup_to_release); | ||
5777 | } | ||
5778 | |||
5779 | /** | ||
5780 | * btrfs_delayed_refs_rsv_release - release a ref head's reservation. | ||
5781 | * @fs_info - the fs_info for our fs. | ||
5782 | * @nr - the number of items to drop. | ||
5783 | * | ||
5784 | * This drops the delayed ref head's count from the delayed refs rsv and frees | ||
5785 | * any excess reservation we had. | ||
5786 | */ | ||
5787 | void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) | ||
5788 | { | ||
5789 | struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; | ||
5790 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
5791 | u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); | ||
5792 | u64 released = 0; | ||
5793 | |||
5794 | released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, | ||
5795 | num_bytes, NULL); | ||
5796 | if (released) | ||
5797 | trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", | ||
5798 | 0, released, 0); | ||
5799 | } | ||
5800 | |||
5801 | static void update_global_block_rsv(struct btrfs_fs_info *fs_info) | ||
5802 | { | ||
5803 | struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; | ||
5804 | struct btrfs_space_info *sinfo = block_rsv->space_info; | ||
5805 | u64 num_bytes; | ||
5806 | |||
5807 | /* | ||
5808 | * The global block rsv is based on the size of the extent tree, the | ||
5809 | * checksum tree and the root tree. If the fs is empty we want to set | ||
5810 | * it to a minimal amount for safety. | ||
5811 | */ | ||
5812 | num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + | ||
5813 | btrfs_root_used(&fs_info->csum_root->root_item) + | ||
5814 | btrfs_root_used(&fs_info->tree_root->root_item); | ||
5815 | num_bytes = max_t(u64, num_bytes, SZ_16M); | ||
5816 | |||
5817 | spin_lock(&sinfo->lock); | ||
5818 | spin_lock(&block_rsv->lock); | ||
5819 | |||
5820 | block_rsv->size = min_t(u64, num_bytes, SZ_512M); | ||
5821 | |||
5822 | if (block_rsv->reserved < block_rsv->size) { | ||
5823 | num_bytes = btrfs_space_info_used(sinfo, true); | ||
5824 | if (sinfo->total_bytes > num_bytes) { | ||
5825 | num_bytes = sinfo->total_bytes - num_bytes; | ||
5826 | num_bytes = min(num_bytes, | ||
5827 | block_rsv->size - block_rsv->reserved); | ||
5828 | block_rsv->reserved += num_bytes; | ||
5829 | update_bytes_may_use(sinfo, num_bytes); | ||
5830 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
5831 | sinfo->flags, num_bytes, | ||
5832 | 1); | ||
5833 | } | ||
5834 | } else if (block_rsv->reserved > block_rsv->size) { | ||
5835 | num_bytes = block_rsv->reserved - block_rsv->size; | ||
5836 | update_bytes_may_use(sinfo, -num_bytes); | ||
5837 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
5838 | sinfo->flags, num_bytes, 0); | ||
5839 | block_rsv->reserved = block_rsv->size; | ||
5840 | } | ||
5841 | |||
5842 | if (block_rsv->reserved == block_rsv->size) | ||
5843 | block_rsv->full = 1; | ||
5844 | else | ||
5845 | block_rsv->full = 0; | ||
5846 | |||
5847 | spin_unlock(&block_rsv->lock); | ||
5848 | spin_unlock(&sinfo->lock); | ||
5849 | } | ||
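
The sizing rule above reduces to: sum the on-disk usage of the extent, csum and root trees, never go below 16MiB and never above 512MiB. A standalone worked example of the clamp:

#include <stdint.h>
#include <stdio.h>

static uint64_t global_rsv_target(uint64_t extent_root_used,
				  uint64_t csum_root_used,
				  uint64_t tree_root_used)
{
	const uint64_t SZ_16M = 16ULL << 20, SZ_512M = 512ULL << 20;
	uint64_t num_bytes = extent_root_used + csum_root_used + tree_root_used;

	if (num_bytes < SZ_16M)		/* nearly empty fs: keep a safety floor */
		num_bytes = SZ_16M;
	if (num_bytes > SZ_512M)	/* huge fs: cap the reserve */
		num_bytes = SZ_512M;
	return num_bytes;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)global_rsv_target(3ULL << 20,
							       1ULL << 20,
							       1ULL << 20));
	/* prints 16777216: the 5MiB sum is raised to the 16MiB floor */
	return 0;
}

The cap keeps the reserve from ballooning on large filesystems while the floor protects a nearly empty one.
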
5850 | |||
5851 | static void init_global_block_rsv(struct btrfs_fs_info *fs_info) | ||
5852 | { | ||
5853 | struct btrfs_space_info *space_info; | ||
5854 | |||
5855 | space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); | ||
5856 | fs_info->chunk_block_rsv.space_info = space_info; | ||
5857 | |||
5858 | space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); | ||
5859 | fs_info->global_block_rsv.space_info = space_info; | ||
5860 | fs_info->trans_block_rsv.space_info = space_info; | ||
5861 | fs_info->empty_block_rsv.space_info = space_info; | ||
5862 | fs_info->delayed_block_rsv.space_info = space_info; | ||
5863 | fs_info->delayed_refs_rsv.space_info = space_info; | ||
5864 | |||
5865 | fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; | ||
5866 | fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; | ||
5867 | fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; | ||
5868 | fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; | ||
5869 | if (fs_info->quota_root) | ||
5870 | fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; | ||
5871 | fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; | ||
5872 | |||
5873 | update_global_block_rsv(fs_info); | ||
5874 | } | ||
5875 | |||
5876 | static void release_global_block_rsv(struct btrfs_fs_info *fs_info) | ||
5877 | { | ||
5878 | block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, | ||
5879 | (u64)-1, NULL); | ||
5880 | WARN_ON(fs_info->trans_block_rsv.size > 0); | ||
5881 | WARN_ON(fs_info->trans_block_rsv.reserved > 0); | ||
5882 | WARN_ON(fs_info->chunk_block_rsv.size > 0); | ||
5883 | WARN_ON(fs_info->chunk_block_rsv.reserved > 0); | ||
5884 | WARN_ON(fs_info->delayed_block_rsv.size > 0); | ||
5885 | WARN_ON(fs_info->delayed_block_rsv.reserved > 0); | ||
5886 | WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); | ||
5887 | WARN_ON(fs_info->delayed_refs_rsv.size > 0); | ||
5888 | } | ||
5889 | |||
5890 | /* | ||
5891 | * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv | ||
5892 | * @trans - the trans that may have generated delayed refs | ||
5893 | * | ||
5894 | * This is to be called anytime we may have adjusted trans->delayed_ref_updates; | ||
5895 | * it'll calculate the additional size and add it to the delayed_refs_rsv. | ||
5896 | */ | ||
5897 | void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) | ||
5898 | { | ||
5899 | struct btrfs_fs_info *fs_info = trans->fs_info; | ||
5900 | struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; | ||
5901 | u64 num_bytes; | ||
5902 | |||
5903 | if (!trans->delayed_ref_updates) | ||
5904 | return; | ||
5905 | |||
5906 | num_bytes = btrfs_calc_trans_metadata_size(fs_info, | ||
5907 | trans->delayed_ref_updates); | ||
5908 | spin_lock(&delayed_rsv->lock); | ||
5909 | delayed_rsv->size += num_bytes; | ||
5910 | delayed_rsv->full = 0; | ||
5911 | spin_unlock(&delayed_rsv->lock); | ||
5912 | trans->delayed_ref_updates = 0; | ||
5913 | } | ||
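
How large the bump is comes from btrfs_calc_trans_metadata_size(), which in this era is assumed to charge two full tree walks per item, i.e. nodesize * BTRFS_MAX_LEVEL * 2. A hedged worked example with a 16KiB nodesize:

#include <stdint.h>
#include <stdio.h>

#define BTRFS_MAX_LEVEL 8	/* real btrfs constant */

/* Assumed formula for btrfs_calc_trans_metadata_size() in this era. */
static uint64_t calc_trans_metadata_size(uint64_t nodesize, unsigned num_items)
{
	return nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}

int main(void)
{
	uint64_t nodesize = 16 * 1024;
	unsigned delayed_ref_updates = 10;

	/* 10 pending ref updates grow the rsv size by 2.5MiB. */
	printf("%llu bytes\n",
	       (unsigned long long)calc_trans_metadata_size(nodesize,
							    delayed_ref_updates));
	return 0;
}
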
5914 | |||
5915 | /* | ||
5916 | * To be called after all the new block groups attached to the transaction | ||
5917 | * handle have been created (btrfs_create_pending_block_groups()). | ||
5918 | */ | ||
5919 | void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) | ||
5920 | { | ||
5921 | struct btrfs_fs_info *fs_info = trans->fs_info; | ||
5922 | |||
5923 | if (!trans->chunk_bytes_reserved) | ||
5924 | return; | ||
5925 | |||
5926 | WARN_ON_ONCE(!list_empty(&trans->new_bgs)); | ||
5927 | |||
5928 | block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, | ||
5929 | trans->chunk_bytes_reserved, NULL); | ||
5930 | trans->chunk_bytes_reserved = 0; | ||
5931 | } | ||
5932 | |||
5933 | /* | ||
5934 | * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation | ||
5935 | * root: the root of the parent directory | ||
5936 | * rsv: block reservation | ||
5937 | * items: the number of items that we need do reservation | ||
5938 | * use_global_rsv: allow fallback to the global block reservation | ||
5939 | * | ||
5940 | * This function is used to reserve space for snapshot/subvolume creation | ||
5941 | * and deletion. Those operations differ from the common file/directory | ||
5942 | * operations: they change two fs/file trees and the root tree, and the | ||
5943 | * number of items that the qgroup reserves differs from the free space | ||
5944 | * reservation. So we cannot use the space reservation mechanism in | ||
5945 | * start_transaction(). | ||
5946 | */ | ||
5947 | int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, | ||
5948 | struct btrfs_block_rsv *rsv, int items, | ||
5949 | bool use_global_rsv) | ||
5950 | { | ||
5951 | u64 qgroup_num_bytes = 0; | ||
5952 | u64 num_bytes; | ||
5953 | int ret; | ||
5954 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
5955 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
5956 | |||
5957 | if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { | ||
5958 | /* One for parent inode, two for dir entries */ | ||
5959 | qgroup_num_bytes = 3 * fs_info->nodesize; | ||
5960 | ret = btrfs_qgroup_reserve_meta_prealloc(root, | ||
5961 | qgroup_num_bytes, true); | ||
5962 | if (ret) | ||
5963 | return ret; | ||
5964 | } | ||
5965 | |||
5966 | num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); | ||
5967 | rsv->space_info = __find_space_info(fs_info, | ||
5968 | BTRFS_BLOCK_GROUP_METADATA); | ||
5969 | ret = btrfs_block_rsv_add(root, rsv, num_bytes, | ||
5970 | BTRFS_RESERVE_FLUSH_ALL); | ||
5971 | |||
5972 | if (ret == -ENOSPC && use_global_rsv) | ||
5973 | ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); | ||
5974 | |||
5975 | if (ret && qgroup_num_bytes) | ||
5976 | btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); | ||
5977 | |||
5978 | return ret; | ||
5979 | } | ||
5980 | |||
5981 | void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, | ||
5982 | struct btrfs_block_rsv *rsv) | ||
5983 | { | ||
5984 | btrfs_block_rsv_release(fs_info, rsv, (u64)-1); | ||
5985 | } | ||
5986 | |||
5987 | static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, | ||
5988 | struct btrfs_inode *inode) | ||
5989 | { | ||
5990 | struct btrfs_block_rsv *block_rsv = &inode->block_rsv; | ||
5991 | u64 reserve_size = 0; | ||
5992 | u64 qgroup_rsv_size = 0; | ||
5993 | u64 csum_leaves; | ||
5994 | unsigned outstanding_extents; | ||
5995 | |||
5996 | lockdep_assert_held(&inode->lock); | ||
5997 | outstanding_extents = inode->outstanding_extents; | ||
5998 | if (outstanding_extents) | ||
5999 | reserve_size = btrfs_calc_trans_metadata_size(fs_info, | ||
6000 | outstanding_extents + 1); | ||
6001 | csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, | ||
6002 | inode->csum_bytes); | ||
6003 | reserve_size += btrfs_calc_trans_metadata_size(fs_info, | ||
6004 | csum_leaves); | ||
6005 | /* | ||
6006 | * For qgroup rsv, the calculation is very simple: | ||
6007 | * account one nodesize for each outstanding extent | ||
6008 | * | ||
6009 | * This is an overestimate in most cases. | ||
6010 | */ | ||
6011 | qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; | ||
6012 | |||
6013 | spin_lock(&block_rsv->lock); | ||
6014 | block_rsv->size = reserve_size; | ||
6015 | block_rsv->qgroup_rsv_size = qgroup_rsv_size; | ||
6016 | spin_unlock(&block_rsv->lock); | ||
6017 | } | ||
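
The reservation computed above is the sum of two parts: metadata for outstanding_extents + 1 items (the extra item roughly covers the inode update) plus metadata for the csum leaves, with the qgroup side tracked as one nodesize per outstanding extent. A standalone sketch of that arithmetic; the calc_trans_metadata_size() formula is the same assumption as in the earlier example, and csum_leaves stands in for btrfs_csum_bytes_to_leaves():

#include <stdint.h>
#include <stdio.h>

#define BTRFS_MAX_LEVEL 8

/* Assumed formula for btrfs_calc_trans_metadata_size() in this era. */
static uint64_t calc_trans_metadata_size(uint64_t nodesize, unsigned num_items)
{
	return nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}

int main(void)
{
	uint64_t nodesize = 16 * 1024;
	unsigned outstanding_extents = 4;
	unsigned csum_leaves = 1;	/* stand-in for btrfs_csum_bytes_to_leaves() */

	uint64_t reserve_size = 0;
	if (outstanding_extents)
		reserve_size = calc_trans_metadata_size(nodesize,
							outstanding_extents + 1);
	reserve_size += calc_trans_metadata_size(nodesize, csum_leaves);

	uint64_t qgroup_rsv_size = (uint64_t)outstanding_extents * nodesize;

	printf("block_rsv size %llu, qgroup rsv %llu\n",
	       (unsigned long long)reserve_size,
	       (unsigned long long)qgroup_rsv_size);
	return 0;
}
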
6018 | |||
6019 | static void calc_inode_reservations(struct btrfs_fs_info *fs_info, | ||
6020 | u64 num_bytes, u64 *meta_reserve, | ||
6021 | u64 *qgroup_reserve) | ||
6022 | { | ||
6023 | u64 nr_extents = count_max_extents(num_bytes); | ||
6024 | u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); | ||
6025 | |||
6026 | /* We add one for the inode update at finish ordered time */ | ||
6027 | *meta_reserve = btrfs_calc_trans_metadata_size(fs_info, | ||
6028 | nr_extents + csum_leaves + 1); | ||
6029 | *qgroup_reserve = nr_extents * fs_info->nodesize; | ||
6030 | } | ||
6031 | |||
6032 | int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) | ||
6033 | { | ||
6034 | struct btrfs_root *root = inode->root; | ||
6035 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
6036 | struct btrfs_block_rsv *block_rsv = &inode->block_rsv; | ||
6037 | u64 meta_reserve, qgroup_reserve; | ||
6038 | unsigned nr_extents; | ||
6039 | enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; | ||
6040 | int ret = 0; | ||
6041 | bool delalloc_lock = true; | ||
6042 | |||
6043 | /* If we are a free space inode we must not flush since we will be in | ||
6044 | * the middle of a transaction commit. We also don't need the delalloc | ||
6045 | * mutex since we won't race with anybody. We need this mostly to make | ||
6046 | * lockdep shut its filthy mouth. | ||
6047 | * | ||
6048 | * If we have a transaction open (can happen if we call truncate_block | ||
6049 | * from truncate), then we need FLUSH_LIMIT so we don't deadlock. | ||
6050 | */ | ||
6051 | if (btrfs_is_free_space_inode(inode)) { | ||
6052 | flush = BTRFS_RESERVE_NO_FLUSH; | ||
6053 | delalloc_lock = false; | ||
6054 | } else { | ||
6055 | if (current->journal_info) | ||
6056 | flush = BTRFS_RESERVE_FLUSH_LIMIT; | ||
6057 | |||
6058 | if (btrfs_transaction_in_commit(fs_info)) | ||
6059 | schedule_timeout(1); | ||
6060 | } | ||
6061 | |||
6062 | if (delalloc_lock) | ||
6063 | mutex_lock(&inode->delalloc_mutex); | ||
6064 | |||
6065 | num_bytes = ALIGN(num_bytes, fs_info->sectorsize); | ||
6066 | |||
6067 | /* | ||
6068 | * We always want to do it this way; every other way is wrong and ends | ||
6069 | * in tears. Pre-reserving the amount we are going to add will always | ||
6070 | * be the right way, because otherwise if we have enough parallelism we | ||
6071 | * could end up with thousands of inodes all holding little bits of | ||
6072 | * reservations they were able to make previously and the only way to | ||
6073 | * reclaim that space is to ENOSPC out the operations and clear | ||
6074 | * everything out and try again, which is bad. This way we just | ||
6075 | * over-reserve slightly, and clean up the mess when we are done. | ||
6076 | */ | ||
6077 | calc_inode_reservations(fs_info, num_bytes, &meta_reserve, | ||
6078 | &qgroup_reserve); | ||
6079 | ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); | ||
6080 | if (ret) | ||
6081 | goto out_fail; | ||
6082 | ret = reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); | ||
6083 | if (ret) | ||
6084 | goto out_qgroup; | ||
6085 | |||
6086 | /* | ||
6087 | * Now we need to update our outstanding extents and csum bytes _first_ | ||
6088 | * and then add the reservation to the block_rsv. This keeps us from | ||
6089 | * racing with an ordered completion or some such that would think it | ||
6090 | * needs to free the reservation we just made. | ||
6091 | */ | ||
6092 | spin_lock(&inode->lock); | ||
6093 | nr_extents = count_max_extents(num_bytes); | ||
6094 | btrfs_mod_outstanding_extents(inode, nr_extents); | ||
6095 | inode->csum_bytes += num_bytes; | ||
6096 | btrfs_calculate_inode_block_rsv_size(fs_info, inode); | ||
6097 | spin_unlock(&inode->lock); | ||
6098 | |||
6099 | /* Now we can safely add our space to our block rsv */ | ||
6100 | block_rsv_add_bytes(block_rsv, meta_reserve, false); | ||
6101 | trace_btrfs_space_reservation(root->fs_info, "delalloc", | ||
6102 | btrfs_ino(inode), meta_reserve, 1); | ||
6103 | |||
6104 | spin_lock(&block_rsv->lock); | ||
6105 | block_rsv->qgroup_rsv_reserved += qgroup_reserve; | ||
6106 | spin_unlock(&block_rsv->lock); | ||
6107 | |||
6108 | if (delalloc_lock) | ||
6109 | mutex_unlock(&inode->delalloc_mutex); | ||
6110 | return 0; | ||
6111 | out_qgroup: | ||
6112 | btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); | ||
6113 | out_fail: | ||
6114 | btrfs_inode_rsv_release(inode, true); | ||
6115 | if (delalloc_lock) | ||
6116 | mutex_unlock(&inode->delalloc_mutex); | ||
6117 | return ret; | ||
6118 | } | ||
6119 | |||
6120 | /** | ||
6121 | * btrfs_delalloc_release_metadata - release a metadata reservation for an inode | ||
6122 | * @inode: the inode to release the reservation for. | ||
6123 | * @num_bytes: the number of bytes we are releasing. | ||
6124 | * @qgroup_free: free qgroup reservation or convert it to per-trans reservation | ||
6125 | * | ||
6126 | * This will release the metadata reservation for an inode. This can be called | ||
6127 | * once we complete IO for a given set of bytes to release their metadata | ||
6128 | * reservations, or on error for the same reason. | ||
6129 | */ | ||
6130 | void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, | ||
6131 | bool qgroup_free) | ||
6132 | { | ||
6133 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||
6134 | |||
6135 | num_bytes = ALIGN(num_bytes, fs_info->sectorsize); | ||
6136 | spin_lock(&inode->lock); | ||
6137 | inode->csum_bytes -= num_bytes; | ||
6138 | btrfs_calculate_inode_block_rsv_size(fs_info, inode); | ||
6139 | spin_unlock(&inode->lock); | ||
6140 | |||
6141 | if (btrfs_is_testing(fs_info)) | ||
6142 | return; | ||
6143 | |||
6144 | btrfs_inode_rsv_release(inode, qgroup_free); | ||
6145 | } | ||
6146 | |||
6147 | /** | ||
6148 | * btrfs_delalloc_release_extents - release our outstanding_extents | ||
6149 | * @inode: the inode to balance the reservation for. | ||
6150 | * @num_bytes: the number of bytes we originally reserved with | ||
6151 | * @qgroup_free: do we need to free qgroup meta reservation or convert them. | ||
6152 | * | ||
6153 | * When we reserve space we increase outstanding_extents for the extents we may | ||
6154 | * add. Once we've set the range as delalloc or created our ordered extents we | ||
6155 | * have outstanding_extents to track the real usage, so we use this to free our | ||
6156 | * temporarily tracked outstanding_extents. This _must_ be used in conjunction | ||
6157 | * with btrfs_delalloc_reserve_metadata. | ||
6158 | */ | ||
6159 | void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, | ||
6160 | bool qgroup_free) | ||
6161 | { | ||
6162 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | ||
6163 | unsigned num_extents; | ||
6164 | |||
6165 | spin_lock(&inode->lock); | ||
6166 | num_extents = count_max_extents(num_bytes); | ||
6167 | btrfs_mod_outstanding_extents(inode, -num_extents); | ||
6168 | btrfs_calculate_inode_block_rsv_size(fs_info, inode); | ||
6169 | spin_unlock(&inode->lock); | ||
6170 | |||
6171 | if (btrfs_is_testing(fs_info)) | ||
6172 | return; | ||
6173 | |||
6174 | btrfs_inode_rsv_release(inode, qgroup_free); | ||
6175 | } | ||
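
count_max_extents() is assumed here to divide by the maximum delalloc extent size (128MiB in this era), rounding up, which is why reserving and releasing with the same byte count keeps outstanding_extents balanced. A hedged standalone example:

#include <stdint.h>
#include <stdio.h>

/* Assumption: BTRFS_MAX_EXTENT_SIZE is 128MiB in this era. */
#define MAX_EXTENT_SIZE (128ULL << 20)

static unsigned count_max_extents(uint64_t size)
{
	return (size + MAX_EXTENT_SIZE - 1) / MAX_EXTENT_SIZE;
}

int main(void)
{
	/* A 300MiB buffered write may be split into up to 3 extents. */
	printf("%u\n", count_max_extents(300ULL << 20));	/* prints 3 */
	return 0;
}
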
6176 | |||
6177 | /** | ||
6178 | * btrfs_delalloc_reserve_space - reserve data and metadata space for | ||
6179 | * delalloc | ||
6180 | * @inode: inode we're writing to | ||
6181 | * @start: start of the range we are writing to | ||
6182 | * @len: length of the range we are writing to | ||
6183 | * @reserved: mandatory parameter, records the qgroup ranges actually | ||
6184 | * reserved for the current reservation. | ||
6185 | * | ||
6186 | * This will do the following things | ||
6187 | * | ||
6188 | * o reserve space in the data space info for num bytes | ||
6189 | * and reserve the corresponding qgroup space | ||
6190 | * (done in check_data_free_space) | ||
6191 | * | ||
6192 | * o reserve metadata space, based on the number of outstanding extents | ||
6193 | * and how many csums will be needed; metadata space is reserved | ||
6194 | * per root using an over-reserve method | ||
6195 | * o add to the inode's delalloc_bytes | ||
6196 | * o add it to the fs_info's delalloc inodes list | ||
6197 | * (the above 3 are all done in delalloc_reserve_metadata) | ||
6198 | * | ||
6199 | * Return 0 for success | ||
6200 | * Return <0 for error (-ENOSPC or -EDQUOT) | ||
6201 | */ | ||
6202 | int btrfs_delalloc_reserve_space(struct inode *inode, | ||
6203 | struct extent_changeset **reserved, u64 start, u64 len) | ||
6204 | { | ||
6205 | int ret; | ||
6206 | |||
6207 | ret = btrfs_check_data_free_space(inode, reserved, start, len); | ||
6208 | if (ret < 0) | ||
6209 | return ret; | ||
6210 | ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); | ||
6211 | if (ret < 0) | ||
6212 | btrfs_free_reserved_data_space(inode, *reserved, start, len); | ||
6213 | return ret; | ||
6214 | } | ||
6215 | |||
6216 | /** | ||
6217 | * btrfs_delalloc_release_space - release data and metadata space for delalloc | ||
6218 | * @inode: inode we're releasing space for | ||
6219 | * @start: start position of the space already reserved | ||
6220 | * @len: the length of the space already reserved | ||
6221 | * @qgroup_free: whether to free or convert the qgroup meta reservation | ||
6222 | * | ||
6223 | * This function will release the metadata space that was not used and will | ||
6224 | * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes | ||
6225 | * list if there are no delalloc bytes left. | ||
6226 | * Also it will handle the qgroup reserved space. | ||
6227 | */ | ||
6228 | void btrfs_delalloc_release_space(struct inode *inode, | ||
6229 | struct extent_changeset *reserved, | ||
6230 | u64 start, u64 len, bool qgroup_free) | ||
6231 | { | ||
6232 | btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); | ||
6233 | btrfs_free_reserved_data_space(inode, reserved, start, len); | ||
6234 | } | ||
6235 | |||
6236 | static int update_block_group(struct btrfs_trans_handle *trans, | 4108 | static int update_block_group(struct btrfs_trans_handle *trans, |
6237 | u64 bytenr, u64 num_bytes, int alloc) | 4109 | u64 bytenr, u64 num_bytes, int alloc) |
6238 | { | 4110 | { |
@@ -6296,7 +4168,8 @@ static int update_block_group(struct btrfs_trans_handle *trans, | |||
6296 | old_val -= num_bytes; | 4168 | old_val -= num_bytes; |
6297 | btrfs_set_block_group_used(&cache->item, old_val); | 4169 | btrfs_set_block_group_used(&cache->item, old_val); |
6298 | cache->pinned += num_bytes; | 4170 | cache->pinned += num_bytes; |
6299 | update_bytes_pinned(cache->space_info, num_bytes); | 4171 | btrfs_space_info_update_bytes_pinned(info, |
4172 | cache->space_info, num_bytes); | ||
6300 | cache->space_info->bytes_used -= num_bytes; | 4173 | cache->space_info->bytes_used -= num_bytes; |
6301 | cache->space_info->disk_used -= num_bytes * factor; | 4174 | cache->space_info->disk_used -= num_bytes * factor; |
6302 | spin_unlock(&cache->lock); | 4175 | spin_unlock(&cache->lock); |
@@ -6371,7 +4244,8 @@ static int pin_down_extent(struct btrfs_block_group_cache *cache, | |||
6371 | spin_lock(&cache->space_info->lock); | 4244 | spin_lock(&cache->space_info->lock); |
6372 | spin_lock(&cache->lock); | 4245 | spin_lock(&cache->lock); |
6373 | cache->pinned += num_bytes; | 4246 | cache->pinned += num_bytes; |
6374 | update_bytes_pinned(cache->space_info, num_bytes); | 4247 | btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, |
4248 | num_bytes); | ||
6375 | if (reserved) { | 4249 | if (reserved) { |
6376 | cache->reserved -= num_bytes; | 4250 | cache->reserved -= num_bytes; |
6377 | cache->space_info->bytes_reserved -= num_bytes; | 4251 | cache->space_info->bytes_reserved -= num_bytes; |
@@ -6580,7 +4454,8 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, | |||
6580 | } else { | 4454 | } else { |
6581 | cache->reserved += num_bytes; | 4455 | cache->reserved += num_bytes; |
6582 | space_info->bytes_reserved += num_bytes; | 4456 | space_info->bytes_reserved += num_bytes; |
6583 | update_bytes_may_use(space_info, -ram_bytes); | 4457 | btrfs_space_info_update_bytes_may_use(cache->fs_info, |
4458 | space_info, -ram_bytes); | ||
6584 | if (delalloc) | 4459 | if (delalloc) |
6585 | cache->delalloc_bytes += num_bytes; | 4460 | cache->delalloc_bytes += num_bytes; |
6586 | } | 4461 | } |
@@ -6646,7 +4521,7 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) | |||
6646 | 4521 | ||
6647 | up_write(&fs_info->commit_root_sem); | 4522 | up_write(&fs_info->commit_root_sem); |
6648 | 4523 | ||
6649 | update_global_block_rsv(fs_info); | 4524 | btrfs_update_global_block_rsv(fs_info); |
6650 | } | 4525 | } |
6651 | 4526 | ||
6652 | /* | 4527 | /* |
@@ -6736,7 +4611,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, | |||
6736 | spin_lock(&space_info->lock); | 4611 | spin_lock(&space_info->lock); |
6737 | spin_lock(&cache->lock); | 4612 | spin_lock(&cache->lock); |
6738 | cache->pinned -= len; | 4613 | cache->pinned -= len; |
6739 | update_bytes_pinned(space_info, -len); | 4614 | btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); |
6740 | 4615 | ||
6741 | trace_btrfs_space_reservation(fs_info, "pinned", | 4616 | trace_btrfs_space_reservation(fs_info, "pinned", |
6742 | space_info->flags, len, 0); | 4617 | space_info->flags, len, 0); |
@@ -6757,7 +4632,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, | |||
6757 | to_add = min(len, global_rsv->size - | 4632 | to_add = min(len, global_rsv->size - |
6758 | global_rsv->reserved); | 4633 | global_rsv->reserved); |
6759 | global_rsv->reserved += to_add; | 4634 | global_rsv->reserved += to_add; |
6760 | update_bytes_may_use(space_info, to_add); | 4635 | btrfs_space_info_update_bytes_may_use(fs_info, |
4636 | space_info, to_add); | ||
6761 | if (global_rsv->reserved >= global_rsv->size) | 4637 | if (global_rsv->reserved >= global_rsv->size) |
6762 | global_rsv->full = 1; | 4638 | global_rsv->full = 1; |
6763 | trace_btrfs_space_reservation(fs_info, | 4639 | trace_btrfs_space_reservation(fs_info, |
@@ -6769,8 +4645,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, | |||
6769 | spin_unlock(&global_rsv->lock); | 4645 | spin_unlock(&global_rsv->lock); |
6770 | /* Add to any tickets we may have */ | 4646 | /* Add to any tickets we may have */ |
6771 | if (len) | 4647 | if (len) |
6772 | space_info_add_new_bytes(fs_info, space_info, | 4648 | btrfs_space_info_add_new_bytes(fs_info, |
6773 | len); | 4649 | space_info, len); |
6774 | } | 4650 | } |
6775 | spin_unlock(&space_info->lock); | 4651 | spin_unlock(&space_info->lock); |
6776 | } | 4652 | } |
@@ -7191,7 +5067,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, | |||
7191 | } | 5067 | } |
7192 | out: | 5068 | out: |
7193 | if (pin) | 5069 | if (pin) |
7194 | add_pinned_bytes(fs_info, &generic_ref, 1); | 5070 | add_pinned_bytes(fs_info, &generic_ref); |
7195 | 5071 | ||
7196 | if (last_ref) { | 5072 | if (last_ref) { |
7197 | /* | 5073 | /* |
@@ -7239,7 +5115,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) | |||
7239 | btrfs_ref_tree_mod(fs_info, ref); | 5115 | btrfs_ref_tree_mod(fs_info, ref); |
7240 | 5116 | ||
7241 | if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) | 5117 | if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) |
7242 | add_pinned_bytes(fs_info, ref, 1); | 5118 | add_pinned_bytes(fs_info, ref); |
7243 | 5119 | ||
7244 | return ret; | 5120 | return ret; |
7245 | } | 5121 | } |
@@ -7292,10 +5168,10 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) | |||
7292 | } | 5168 | } |
7293 | 5169 | ||
7294 | enum btrfs_loop_type { | 5170 | enum btrfs_loop_type { |
7295 | LOOP_CACHING_NOWAIT = 0, | 5171 | LOOP_CACHING_NOWAIT, |
7296 | LOOP_CACHING_WAIT = 1, | 5172 | LOOP_CACHING_WAIT, |
7297 | LOOP_ALLOC_CHUNK = 2, | 5173 | LOOP_ALLOC_CHUNK, |
7298 | LOOP_NO_EMPTY_SIZE = 3, | 5174 | LOOP_NO_EMPTY_SIZE, |
7299 | }; | 5175 | }; |
7300 | 5176 | ||
7301 | static inline void | 5177 | static inline void |
@@ -7661,8 +5537,8 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, | |||
7661 | return ret; | 5537 | return ret; |
7662 | } | 5538 | } |
7663 | 5539 | ||
7664 | ret = do_chunk_alloc(trans, ffe_ctl->flags, | 5540 | ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, |
7665 | CHUNK_ALLOC_FORCE); | 5541 | CHUNK_ALLOC_FORCE); |
7666 | 5542 | ||
7667 | /* | 5543 | /* |
7668 | * If we can't allocate a new chunk we've already looped | 5544 | * If we can't allocate a new chunk we've already looped |
@@ -7758,7 +5634,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, | |||
7758 | 5634 | ||
7759 | trace_find_free_extent(fs_info, num_bytes, empty_size, flags); | 5635 | trace_find_free_extent(fs_info, num_bytes, empty_size, flags); |
7760 | 5636 | ||
7761 | space_info = __find_space_info(fs_info, flags); | 5637 | space_info = btrfs_find_space_info(fs_info, flags); |
7762 | if (!space_info) { | 5638 | if (!space_info) { |
7763 | btrfs_err(fs_info, "No space info for %llu", flags); | 5639 | btrfs_err(fs_info, "No space info for %llu", flags); |
7764 | return -ENOSPC; | 5640 | return -ENOSPC; |
@@ -7863,9 +5739,8 @@ search: | |||
7863 | */ | 5739 | */ |
7864 | if (!block_group_bits(block_group, flags)) { | 5740 | if (!block_group_bits(block_group, flags)) { |
7865 | u64 extra = BTRFS_BLOCK_GROUP_DUP | | 5741 | u64 extra = BTRFS_BLOCK_GROUP_DUP | |
7866 | BTRFS_BLOCK_GROUP_RAID1 | | 5742 | BTRFS_BLOCK_GROUP_RAID1_MASK | |
7867 | BTRFS_BLOCK_GROUP_RAID5 | | 5743 | BTRFS_BLOCK_GROUP_RAID56_MASK | |
7868 | BTRFS_BLOCK_GROUP_RAID6 | | ||
7869 | BTRFS_BLOCK_GROUP_RAID10; | 5744 | BTRFS_BLOCK_GROUP_RAID10; |
7870 | 5745 | ||
7871 | /* | 5746 | /* |
@@ -7984,60 +5859,6 @@ loop: | |||
7984 | return ret; | 5859 | return ret; |
7985 | } | 5860 | } |
7986 | 5861 | ||
7987 | #define DUMP_BLOCK_RSV(fs_info, rsv_name) \ | ||
7988 | do { \ | ||
7989 | struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ | ||
7990 | spin_lock(&__rsv->lock); \ | ||
7991 | btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ | ||
7992 | __rsv->size, __rsv->reserved); \ | ||
7993 | spin_unlock(&__rsv->lock); \ | ||
7994 | } while (0) | ||
7995 | |||
7996 | static void dump_space_info(struct btrfs_fs_info *fs_info, | ||
7997 | struct btrfs_space_info *info, u64 bytes, | ||
7998 | int dump_block_groups) | ||
7999 | { | ||
8000 | struct btrfs_block_group_cache *cache; | ||
8001 | int index = 0; | ||
8002 | |||
8003 | spin_lock(&info->lock); | ||
8004 | btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", | ||
8005 | info->flags, | ||
8006 | info->total_bytes - btrfs_space_info_used(info, true), | ||
8007 | info->full ? "" : "not "); | ||
8008 | btrfs_info(fs_info, | ||
8009 | "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", | ||
8010 | info->total_bytes, info->bytes_used, info->bytes_pinned, | ||
8011 | info->bytes_reserved, info->bytes_may_use, | ||
8012 | info->bytes_readonly); | ||
8013 | spin_unlock(&info->lock); | ||
8014 | |||
8015 | DUMP_BLOCK_RSV(fs_info, global_block_rsv); | ||
8016 | DUMP_BLOCK_RSV(fs_info, trans_block_rsv); | ||
8017 | DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); | ||
8018 | DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); | ||
8019 | DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); | ||
8020 | |||
8021 | if (!dump_block_groups) | ||
8022 | return; | ||
8023 | |||
8024 | down_read(&info->groups_sem); | ||
8025 | again: | ||
8026 | list_for_each_entry(cache, &info->block_groups[index], list) { | ||
8027 | spin_lock(&cache->lock); | ||
8028 | btrfs_info(fs_info, | ||
8029 | "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", | ||
8030 | cache->key.objectid, cache->key.offset, | ||
8031 | btrfs_block_group_used(&cache->item), cache->pinned, | ||
8032 | cache->reserved, cache->ro ? "[readonly]" : ""); | ||
8033 | btrfs_dump_free_space(cache, bytes); | ||
8034 | spin_unlock(&cache->lock); | ||
8035 | } | ||
8036 | if (++index < BTRFS_NR_RAID_TYPES) | ||
8037 | goto again; | ||
8038 | up_read(&info->groups_sem); | ||
8039 | } | ||
8040 | |||
8041 | /* | 5862 | /* |
8042 | * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a | 5863 | * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a |
8043 | * hole that is at least as big as @num_bytes. | 5864 | * hole that is at least as big as @num_bytes. |
@@ -8113,12 +5934,13 @@ again: | |||
8113 | } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { | 5934 | } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { |
8114 | struct btrfs_space_info *sinfo; | 5935 | struct btrfs_space_info *sinfo; |
8115 | 5936 | ||
8116 | sinfo = __find_space_info(fs_info, flags); | 5937 | sinfo = btrfs_find_space_info(fs_info, flags); |
8117 | btrfs_err(fs_info, | 5938 | btrfs_err(fs_info, |
8118 | "allocation failed flags %llu, wanted %llu", | 5939 | "allocation failed flags %llu, wanted %llu", |
8119 | flags, num_bytes); | 5940 | flags, num_bytes); |
8120 | if (sinfo) | 5941 | if (sinfo) |
8121 | dump_space_info(fs_info, sinfo, num_bytes, 1); | 5942 | btrfs_dump_space_info(fs_info, sinfo, |
5943 | num_bytes, 1); | ||
8122 | } | 5944 | } |
8123 | } | 5945 | } |
8124 | 5946 | ||
@@ -8456,73 +6278,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
8456 | return buf; | 6278 | return buf; |
8457 | } | 6279 | } |
8458 | 6280 | ||
8459 | static struct btrfs_block_rsv * | ||
8460 | use_block_rsv(struct btrfs_trans_handle *trans, | ||
8461 | struct btrfs_root *root, u32 blocksize) | ||
8462 | { | ||
8463 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
8464 | struct btrfs_block_rsv *block_rsv; | ||
8465 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
8466 | int ret; | ||
8467 | bool global_updated = false; | ||
8468 | |||
8469 | block_rsv = get_block_rsv(trans, root); | ||
8470 | |||
8471 | if (unlikely(block_rsv->size == 0)) | ||
8472 | goto try_reserve; | ||
8473 | again: | ||
8474 | ret = block_rsv_use_bytes(block_rsv, blocksize); | ||
8475 | if (!ret) | ||
8476 | return block_rsv; | ||
8477 | |||
8478 | if (block_rsv->failfast) | ||
8479 | return ERR_PTR(ret); | ||
8480 | |||
8481 | if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { | ||
8482 | global_updated = true; | ||
8483 | update_global_block_rsv(fs_info); | ||
8484 | goto again; | ||
8485 | } | ||
8486 | |||
8487 | /* | ||
8488 | * The global reserve still exists to save us from ourselves, so don't | ||
8489 | * warn_on if we are short on our delayed refs reserve. | ||
8490 | */ | ||
8491 | if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && | ||
8492 | btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { | ||
8493 | static DEFINE_RATELIMIT_STATE(_rs, | ||
8494 | DEFAULT_RATELIMIT_INTERVAL * 10, | ||
8495 | /*DEFAULT_RATELIMIT_BURST*/ 1); | ||
8496 | if (__ratelimit(&_rs)) | ||
8497 | WARN(1, KERN_DEBUG | ||
8498 | "BTRFS: block rsv returned %d\n", ret); | ||
8499 | } | ||
8500 | try_reserve: | ||
8501 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, | ||
8502 | BTRFS_RESERVE_NO_FLUSH); | ||
8503 | if (!ret) | ||
8504 | return block_rsv; | ||
8505 | /* | ||
8506 | * If we couldn't reserve metadata bytes try and use some from | ||
8507 | * the global reserve if its space type is the same as the global | ||
8508 | * reservation. | ||
8509 | */ | ||
8510 | if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && | ||
8511 | block_rsv->space_info == global_rsv->space_info) { | ||
8512 | ret = block_rsv_use_bytes(global_rsv, blocksize); | ||
8513 | if (!ret) | ||
8514 | return global_rsv; | ||
8515 | } | ||
8516 | return ERR_PTR(ret); | ||
8517 | } | ||
8518 | |||
8519 | static void unuse_block_rsv(struct btrfs_fs_info *fs_info, | ||
8520 | struct btrfs_block_rsv *block_rsv, u32 blocksize) | ||
8521 | { | ||
8522 | block_rsv_add_bytes(block_rsv, blocksize, false); | ||
8523 | block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL); | ||
8524 | } | ||
8525 | |||
8526 | /* | 6281 | /* |
8527 | * finds a free extent and does all the dirty work required for allocation | 6282 | * finds a free extent and does all the dirty work required for allocation |
8528 | * returns the tree buffer or an ERR_PTR on error. | 6283 | * returns the tree buffer or an ERR_PTR on error. |
@@ -8555,7 +6310,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, | |||
8555 | } | 6310 | } |
8556 | #endif | 6311 | #endif |
8557 | 6312 | ||
8558 | block_rsv = use_block_rsv(trans, root, blocksize); | 6313 | block_rsv = btrfs_use_block_rsv(trans, root, blocksize); |
8559 | if (IS_ERR(block_rsv)) | 6314 | if (IS_ERR(block_rsv)) |
8560 | return ERR_CAST(block_rsv); | 6315 | return ERR_CAST(block_rsv); |
8561 | 6316 | ||
@@ -8613,7 +6368,7 @@ out_free_buf: | |||
8613 | out_free_reserved: | 6368 | out_free_reserved: |
8614 | btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); | 6369 | btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); |
8615 | out_unuse: | 6370 | out_unuse: |
8616 | unuse_block_rsv(fs_info, block_rsv, blocksize); | 6371 | btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); |
8617 | return ERR_PTR(ret); | 6372 | return ERR_PTR(ret); |
8618 | } | 6373 | } |
8619 | 6374 | ||
@@ -9552,9 +7307,8 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) | |||
9552 | 7307 | ||
9553 | num_devices = fs_info->fs_devices->rw_devices; | 7308 | num_devices = fs_info->fs_devices->rw_devices; |
9554 | 7309 | ||
9555 | stripped = BTRFS_BLOCK_GROUP_RAID0 | | 7310 | stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK | |
9556 | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | | 7311 | BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; |
9557 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; | ||
9558 | 7312 | ||
9559 | if (num_devices == 1) { | 7313 | if (num_devices == 1) { |
9560 | stripped |= BTRFS_BLOCK_GROUP_DUP; | 7314 | stripped |= BTRFS_BLOCK_GROUP_DUP; |
@@ -9565,7 +7319,7 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) | |||
9565 | return stripped; | 7319 | return stripped; |
9566 | 7320 | ||
9567 | /* turn mirroring into duplication */ | 7321 | /* turn mirroring into duplication */ |
9568 | if (flags & (BTRFS_BLOCK_GROUP_RAID1 | | 7322 | if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK | |
9569 | BTRFS_BLOCK_GROUP_RAID10)) | 7323 | BTRFS_BLOCK_GROUP_RAID10)) |
9570 | return stripped | BTRFS_BLOCK_GROUP_DUP; | 7324 | return stripped | BTRFS_BLOCK_GROUP_DUP; |
9571 | } else { | 7325 | } else { |
@@ -9636,7 +7390,7 @@ out: | |||
9636 | btrfs_info(cache->fs_info, | 7390 | btrfs_info(cache->fs_info, |
9637 | "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", | 7391 | "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", |
9638 | sinfo_used, num_bytes, min_allocable_bytes); | 7392 | sinfo_used, num_bytes, min_allocable_bytes); |
9639 | dump_space_info(cache->fs_info, cache->space_info, 0, 0); | 7393 | btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); |
9640 | } | 7394 | } |
9641 | return ret; | 7395 | return ret; |
9642 | } | 7396 | } |
@@ -9678,8 +7432,7 @@ again: | |||
9678 | */ | 7432 | */ |
9679 | alloc_flags = update_block_group_flags(fs_info, cache->flags); | 7433 | alloc_flags = update_block_group_flags(fs_info, cache->flags); |
9680 | if (alloc_flags != cache->flags) { | 7434 | if (alloc_flags != cache->flags) { |
9681 | ret = do_chunk_alloc(trans, alloc_flags, | 7435 | ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); |
9682 | CHUNK_ALLOC_FORCE); | ||
9683 | /* | 7436 | /* |
9684 | * ENOSPC is allowed here, we may have enough space | 7437 | * ENOSPC is allowed here, we may have enough space |
9685 | * already allocated at the new raid level to | 7438 | * already allocated at the new raid level to |
@@ -9695,7 +7448,7 @@ again: | |||
9695 | if (!ret) | 7448 | if (!ret) |
9696 | goto out; | 7449 | goto out; |
9697 | alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); | 7450 | alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); |
9698 | ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); | 7451 | ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); |
9699 | if (ret < 0) | 7452 | if (ret < 0) |
9700 | goto out; | 7453 | goto out; |
9701 | ret = inc_block_group_ro(cache, 0); | 7454 | ret = inc_block_group_ro(cache, 0); |
@@ -9716,7 +7469,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) | |||
9716 | { | 7469 | { |
9717 | u64 alloc_flags = get_alloc_profile(trans->fs_info, type); | 7470 | u64 alloc_flags = get_alloc_profile(trans->fs_info, type); |
9718 | 7471 | ||
9719 | return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); | 7472 | return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); |
9720 | } | 7473 | } |
9721 | 7474 | ||
9722 | /* | 7475 | /* |
@@ -9949,7 +7702,7 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, | |||
9949 | struct extent_map_tree *em_tree; | 7702 | struct extent_map_tree *em_tree; |
9950 | struct extent_map *em; | 7703 | struct extent_map *em; |
9951 | 7704 | ||
9952 | em_tree = &root->fs_info->mapping_tree.map_tree; | 7705 | em_tree = &root->fs_info->mapping_tree; |
9953 | read_lock(&em_tree->lock); | 7706 | read_lock(&em_tree->lock); |
9954 | em = lookup_extent_mapping(em_tree, found_key.objectid, | 7707 | em = lookup_extent_mapping(em_tree, found_key.objectid, |
9955 | found_key.offset); | 7708 | found_key.offset); |
@@ -10102,7 +7855,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) | |||
10102 | */ | 7855 | */ |
10103 | synchronize_rcu(); | 7856 | synchronize_rcu(); |
10104 | 7857 | ||
10105 | release_global_block_rsv(info); | 7858 | btrfs_release_global_block_rsv(info); |
10106 | 7859 | ||
10107 | while (!list_empty(&info->space_info)) { | 7860 | while (!list_empty(&info->space_info)) { |
10108 | int i; | 7861 | int i; |
@@ -10118,7 +7871,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) | |||
10118 | if (WARN_ON(space_info->bytes_pinned > 0 || | 7871 | if (WARN_ON(space_info->bytes_pinned > 0 || |
10119 | space_info->bytes_reserved > 0 || | 7872 | space_info->bytes_reserved > 0 || |
10120 | space_info->bytes_may_use > 0)) | 7873 | space_info->bytes_may_use > 0)) |
10121 | dump_space_info(info, space_info, 0, 0); | 7874 | btrfs_dump_space_info(info, space_info, 0, 0); |
10122 | list_del(&space_info->list); | 7875 | list_del(&space_info->list); |
10123 | for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { | 7876 | for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { |
10124 | struct kobject *kobj; | 7877 | struct kobject *kobj; |
@@ -10141,7 +7894,6 @@ void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) | |||
10141 | struct btrfs_space_info *space_info; | 7894 | struct btrfs_space_info *space_info; |
10142 | struct raid_kobject *rkobj; | 7895 | struct raid_kobject *rkobj; |
10143 | LIST_HEAD(list); | 7896 | LIST_HEAD(list); |
10144 | int index; | ||
10145 | int ret = 0; | 7897 | int ret = 0; |
10146 | 7898 | ||
10147 | spin_lock(&fs_info->pending_raid_kobjs_lock); | 7899 | spin_lock(&fs_info->pending_raid_kobjs_lock); |
@@ -10149,11 +7901,10 @@ void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) | |||
10149 | spin_unlock(&fs_info->pending_raid_kobjs_lock); | 7901 | spin_unlock(&fs_info->pending_raid_kobjs_lock); |
10150 | 7902 | ||
10151 | list_for_each_entry(rkobj, &list, list) { | 7903 | list_for_each_entry(rkobj, &list, list) { |
10152 | space_info = __find_space_info(fs_info, rkobj->flags); | 7904 | space_info = btrfs_find_space_info(fs_info, rkobj->flags); |
10153 | index = btrfs_bg_flags_to_raid_index(rkobj->flags); | ||
10154 | 7905 | ||
10155 | ret = kobject_add(&rkobj->kobj, &space_info->kobj, | 7906 | ret = kobject_add(&rkobj->kobj, &space_info->kobj, |
10156 | "%s", get_raid_name(index)); | 7907 | "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); |
10157 | if (ret) { | 7908 | if (ret) { |
10158 | kobject_put(&rkobj->kobj); | 7909 | kobject_put(&rkobj->kobj); |
10159 | break; | 7910 | break; |
@@ -10243,21 +7994,21 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, | |||
10243 | */ | 7994 | */ |
10244 | static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) | 7995 | static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) |
10245 | { | 7996 | { |
10246 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; | 7997 | struct extent_map_tree *map_tree = &fs_info->mapping_tree; |
10247 | struct extent_map *em; | 7998 | struct extent_map *em; |
10248 | struct btrfs_block_group_cache *bg; | 7999 | struct btrfs_block_group_cache *bg; |
10249 | u64 start = 0; | 8000 | u64 start = 0; |
10250 | int ret = 0; | 8001 | int ret = 0; |
10251 | 8002 | ||
10252 | while (1) { | 8003 | while (1) { |
10253 | read_lock(&map_tree->map_tree.lock); | 8004 | read_lock(&map_tree->lock); |
10254 | /* | 8005 | /* |
10255 | * lookup_extent_mapping will return the first extent map | 8006 | * lookup_extent_mapping will return the first extent map |
10256 | * intersecting the range, so setting @len to 1 is enough to | 8007 | * intersecting the range, so setting @len to 1 is enough to |
10257 | * get the first chunk. | 8008 | * get the first chunk. |
10258 | */ | 8009 | */ |
10259 | em = lookup_extent_mapping(&map_tree->map_tree, start, 1); | 8010 | em = lookup_extent_mapping(map_tree, start, 1); |
10260 | read_unlock(&map_tree->map_tree.lock); | 8011 | read_unlock(&map_tree->lock); |
10261 | if (!em) | 8012 | if (!em) |
10262 | break; | 8013 | break; |
10263 | 8014 | ||
@@ -10417,9 +8168,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) | |||
10417 | } | 8168 | } |
10418 | 8169 | ||
10419 | trace_btrfs_add_block_group(info, cache, 0); | 8170 | trace_btrfs_add_block_group(info, cache, 0); |
10420 | update_space_info(info, cache->flags, found_key.offset, | 8171 | btrfs_update_space_info(info, cache->flags, found_key.offset, |
10421 | btrfs_block_group_used(&cache->item), | 8172 | btrfs_block_group_used(&cache->item), |
10422 | cache->bytes_super, &space_info); | 8173 | cache->bytes_super, &space_info); |
10423 | 8174 | ||
10424 | cache->space_info = space_info; | 8175 | cache->space_info = space_info; |
10425 | 8176 | ||
@@ -10437,9 +8188,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) | |||
10437 | list_for_each_entry_rcu(space_info, &info->space_info, list) { | 8188 | list_for_each_entry_rcu(space_info, &info->space_info, list) { |
10438 | if (!(get_alloc_profile(info, space_info->flags) & | 8189 | if (!(get_alloc_profile(info, space_info->flags) & |
10439 | (BTRFS_BLOCK_GROUP_RAID10 | | 8190 | (BTRFS_BLOCK_GROUP_RAID10 | |
10440 | BTRFS_BLOCK_GROUP_RAID1 | | 8191 | BTRFS_BLOCK_GROUP_RAID1_MASK | |
10441 | BTRFS_BLOCK_GROUP_RAID5 | | 8192 | BTRFS_BLOCK_GROUP_RAID56_MASK | |
10442 | BTRFS_BLOCK_GROUP_RAID6 | | ||
10443 | BTRFS_BLOCK_GROUP_DUP))) | 8193 | BTRFS_BLOCK_GROUP_DUP))) |
10444 | continue; | 8194 | continue; |
10445 | /* | 8195 | /* |
@@ -10457,7 +8207,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) | |||
10457 | } | 8207 | } |
10458 | 8208 | ||
10459 | btrfs_add_raid_kobjects(info); | 8209 | btrfs_add_raid_kobjects(info); |
10460 | init_global_block_rsv(info); | 8210 | btrfs_init_global_block_rsv(info); |
10461 | ret = check_chunk_block_group_mappings(info); | 8211 | ret = check_chunk_block_group_mappings(info); |
10462 | error: | 8212 | error: |
10463 | btrfs_free_path(path); | 8213 | btrfs_free_path(path); |
@@ -10554,7 +8304,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, | |||
10554 | * assigned to our block group. We want our bg to be added to the rbtree | 8304 | * assigned to our block group. We want our bg to be added to the rbtree |
10555 | * with its ->space_info set. | 8305 | * with its ->space_info set. |
10556 | */ | 8306 | */ |
10557 | cache->space_info = __find_space_info(fs_info, cache->flags); | 8307 | cache->space_info = btrfs_find_space_info(fs_info, cache->flags); |
10558 | ASSERT(cache->space_info); | 8308 | ASSERT(cache->space_info); |
10559 | 8309 | ||
10560 | ret = btrfs_add_block_group_cache(fs_info, cache); | 8310 | ret = btrfs_add_block_group_cache(fs_info, cache); |
@@ -10569,9 +8319,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, | |||
10569 | * the rbtree, update the space info's counters. | 8319 | * the rbtree, update the space info's counters. |
10570 | */ | 8320 | */ |
10571 | trace_btrfs_add_block_group(fs_info, cache, 1); | 8321 | trace_btrfs_add_block_group(fs_info, cache, 1); |
10572 | update_space_info(fs_info, cache->flags, size, bytes_used, | 8322 | btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, |
10573 | cache->bytes_super, &cache->space_info); | 8323 | cache->bytes_super, &cache->space_info); |
10574 | update_global_block_rsv(fs_info); | 8324 | btrfs_update_global_block_rsv(fs_info); |
10575 | 8325 | ||
10576 | link_block_group(cache); | 8326 | link_block_group(cache); |
10577 | 8327 | ||
@@ -10598,6 +8348,35 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) | |||
10598 | write_sequnlock(&fs_info->profiles_lock); | 8348 | write_sequnlock(&fs_info->profiles_lock); |
10599 | } | 8349 | } |
10600 | 8350 | ||
8351 | /* | ||
8352 | * Clear incompat bits for the following feature(s): | ||
8353 | * | ||
8354 | * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group | ||
8355 | * in the whole filesystem | ||
8356 | */ | ||
8357 | static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) | ||
8358 | { | ||
8359 | if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) { | ||
8360 | struct list_head *head = &fs_info->space_info; | ||
8361 | struct btrfs_space_info *sinfo; | ||
8362 | |||
8363 | list_for_each_entry_rcu(sinfo, head, list) { | ||
8364 | bool found = false; | ||
8365 | |||
8366 | down_read(&sinfo->groups_sem); | ||
8367 | if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5])) | ||
8368 | found = true; | ||
8369 | if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6])) | ||
8370 | found = true; | ||
8371 | up_read(&sinfo->groups_sem); | ||
8372 | |||
8373 | if (found) | ||
8374 | return; | ||
8375 | } | ||
8376 | btrfs_clear_fs_incompat(fs_info, RAID56); | ||
8377 | } | ||
8378 | } | ||
8379 | |||
10601 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | 8380 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, |
10602 | u64 group_start, struct extent_map *em) | 8381 | u64 group_start, struct extent_map *em) |
10603 | { | 8382 | { |
@@ -10744,6 +8523,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
10744 | clear_avail_alloc_bits(fs_info, block_group->flags); | 8523 | clear_avail_alloc_bits(fs_info, block_group->flags); |
10745 | } | 8524 | } |
10746 | up_write(&block_group->space_info->groups_sem); | 8525 | up_write(&block_group->space_info->groups_sem); |
8526 | clear_incompat_bg_bits(fs_info, block_group->flags); | ||
10747 | if (kobj) { | 8527 | if (kobj) { |
10748 | kobject_del(kobj); | 8528 | kobject_del(kobj); |
10749 | kobject_put(kobj); | 8529 | kobject_put(kobj); |
@@ -10853,7 +8633,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
10853 | if (remove_em) { | 8633 | if (remove_em) { |
10854 | struct extent_map_tree *em_tree; | 8634 | struct extent_map_tree *em_tree; |
10855 | 8635 | ||
10856 | em_tree = &fs_info->mapping_tree.map_tree; | 8636 | em_tree = &fs_info->mapping_tree; |
10857 | write_lock(&em_tree->lock); | 8637 | write_lock(&em_tree->lock); |
10858 | remove_extent_mapping(em_tree, em); | 8638 | remove_extent_mapping(em_tree, em); |
10859 | write_unlock(&em_tree->lock); | 8639 | write_unlock(&em_tree->lock); |
@@ -10871,7 +8651,7 @@ struct btrfs_trans_handle * | |||
10871 | btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, | 8651 | btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, |
10872 | const u64 chunk_offset) | 8652 | const u64 chunk_offset) |
10873 | { | 8653 | { |
10874 | struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; | 8654 | struct extent_map_tree *em_tree = &fs_info->mapping_tree; |
10875 | struct extent_map *em; | 8655 | struct extent_map *em; |
10876 | struct map_lookup *map; | 8656 | struct map_lookup *map; |
10877 | unsigned int num_items; | 8657 | unsigned int num_items; |
@@ -11020,7 +8800,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) | |||
11020 | spin_lock(&space_info->lock); | 8800 | spin_lock(&space_info->lock); |
11021 | spin_lock(&block_group->lock); | 8801 | spin_lock(&block_group->lock); |
11022 | 8802 | ||
11023 | update_bytes_pinned(space_info, -block_group->pinned); | 8803 | btrfs_space_info_update_bytes_pinned(fs_info, space_info, |
8804 | -block_group->pinned); | ||
11024 | space_info->bytes_readonly += block_group->pinned; | 8805 | space_info->bytes_readonly += block_group->pinned; |
11025 | percpu_counter_add_batch(&space_info->total_bytes_pinned, | 8806 | percpu_counter_add_batch(&space_info->total_bytes_pinned, |
11026 | -block_group->pinned, | 8807 | -block_group->pinned, |
@@ -11076,43 +8857,6 @@ next: | |||
11076 | spin_unlock(&fs_info->unused_bgs_lock); | 8857 | spin_unlock(&fs_info->unused_bgs_lock); |
11077 | } | 8858 | } |
11078 | 8859 | ||
11079 | int btrfs_init_space_info(struct btrfs_fs_info *fs_info) | ||
11080 | { | ||
11081 | struct btrfs_super_block *disk_super; | ||
11082 | u64 features; | ||
11083 | u64 flags; | ||
11084 | int mixed = 0; | ||
11085 | int ret; | ||
11086 | |||
11087 | disk_super = fs_info->super_copy; | ||
11088 | if (!btrfs_super_root(disk_super)) | ||
11089 | return -EINVAL; | ||
11090 | |||
11091 | features = btrfs_super_incompat_flags(disk_super); | ||
11092 | if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) | ||
11093 | mixed = 1; | ||
11094 | |||
11095 | flags = BTRFS_BLOCK_GROUP_SYSTEM; | ||
11096 | ret = create_space_info(fs_info, flags); | ||
11097 | if (ret) | ||
11098 | goto out; | ||
11099 | |||
11100 | if (mixed) { | ||
11101 | flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; | ||
11102 | ret = create_space_info(fs_info, flags); | ||
11103 | } else { | ||
11104 | flags = BTRFS_BLOCK_GROUP_METADATA; | ||
11105 | ret = create_space_info(fs_info, flags); | ||
11106 | if (ret) | ||
11107 | goto out; | ||
11108 | |||
11109 | flags = BTRFS_BLOCK_GROUP_DATA; | ||
11110 | ret = create_space_info(fs_info, flags); | ||
11111 | } | ||
11112 | out: | ||
11113 | return ret; | ||
11114 | } | ||
11115 | |||
11116 | int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, | 8860 | int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, |
11117 | u64 start, u64 end) | 8861 | u64 start, u64 end) |
11118 | { | 8862 | { |
@@ -11171,12 +8915,17 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) | |||
11171 | find_first_clear_extent_bit(&device->alloc_state, start, | 8915 | find_first_clear_extent_bit(&device->alloc_state, start, |
11172 | &start, &end, | 8916 | &start, &end, |
11173 | CHUNK_TRIMMED | CHUNK_ALLOCATED); | 8917 | CHUNK_TRIMMED | CHUNK_ALLOCATED); |
8918 | |||
8919 | /* Ensure we skip the reserved area in the first 1M */ | ||
8920 | start = max_t(u64, start, SZ_1M); | ||
8921 | |||
11174 | /* | 8922 | /* |
11175 | * If find_first_clear_extent_bit finds a range that spans the | 8923 | * If find_first_clear_extent_bit finds a range that spans the |
11176 | * end of the device it will set end to -1, in this case it's up | 8924 | * end of the device it will set end to -1, in this case it's up |
11177 | * to the caller to trim the value to the size of the device. | 8925 | * to the caller to trim the value to the size of the device. |
11178 | */ | 8926 | */ |
11179 | end = min(end, device->total_bytes - 1); | 8927 | end = min(end, device->total_bytes - 1); |
8928 | |||
11180 | len = end - start + 1; | 8929 | len = end - start + 1; |
11181 | 8930 | ||
11182 | /* We didn't find any extents */ | 8931 | /* We didn't find any extents */ |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5106008f5e28..1ff438fd5bc2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -359,6 +359,24 @@ do_insert: | |||
359 | return NULL; | 359 | return NULL; |
360 | } | 360 | } |
361 | 361 | ||
362 | /** | ||
363 | * __etree_search - searches @tree for an entry that contains @offset. Such | ||
364 | * entry would have entry->start <= offset && entry->end >= offset. | ||
365 | * | ||
366 | * @tree - the tree to search | ||
367 | * @offset - offset that should fall within an entry in @tree | ||
368 | * @next_ret - pointer to the first entry whose range ends after @offset | ||
369 | * @prev_ret - pointer to the first entry whose range begins before @offset | ||
370 | * @p_ret - pointer where new node should be anchored (used when inserting an | ||
371 | * entry in the tree) | ||
372 | * @parent_ret - points to the entry which would have been the parent of the entry | ||
373 | * containing @offset | ||
374 | * | ||
375 | * This function returns a pointer to the entry that contains @offset byte | ||
376 | * address. If no such entry exists, then NULL is returned and the other | ||
377 | * pointer arguments to the function are filled, otherwise the found entry is | ||
378 | * returned and other pointers are left untouched. | ||
379 | */ | ||
362 | static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, | 380 | static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, |
363 | struct rb_node **next_ret, | 381 | struct rb_node **next_ret, |
364 | struct rb_node **prev_ret, | 382 | struct rb_node **prev_ret, |
@@ -504,9 +522,11 @@ static int insert_state(struct extent_io_tree *tree, | |||
504 | { | 522 | { |
505 | struct rb_node *node; | 523 | struct rb_node *node; |
506 | 524 | ||
507 | if (end < start) | 525 | if (end < start) { |
508 | WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", | 526 | btrfs_err(tree->fs_info, |
509 | end, start); | 527 | "insert state: end < start %llu %llu", end, start); |
528 | WARN_ON(1); | ||
529 | } | ||
510 | state->start = start; | 530 | state->start = start; |
511 | state->end = end; | 531 | state->end = end; |
512 | 532 | ||
@@ -516,7 +536,8 @@ static int insert_state(struct extent_io_tree *tree, | |||
516 | if (node) { | 536 | if (node) { |
517 | struct extent_state *found; | 537 | struct extent_state *found; |
518 | found = rb_entry(node, struct extent_state, rb_node); | 538 | found = rb_entry(node, struct extent_state, rb_node); |
519 | pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n", | 539 | btrfs_err(tree->fs_info, |
540 | "found node %llu %llu on insert of %llu %llu", | ||
520 | found->start, found->end, start, end); | 541 | found->start, found->end, start, end); |
521 | return -EEXIST; | 542 | return -EEXIST; |
522 | } | 543 | } |
@@ -1537,8 +1558,8 @@ out: | |||
1537 | } | 1558 | } |
1538 | 1559 | ||
1539 | /** | 1560 | /** |
1540 | * find_first_clear_extent_bit - finds the first range that has @bits not set | 1561 | * find_first_clear_extent_bit - find the first range that has @bits not set. |
1541 | * and that starts after @start | 1562 | * This range could start before @start. |
1542 | * | 1563 | * |
1543 | * @tree - the tree to search | 1564 | * @tree - the tree to search |
1544 | * @start - the offset at/after which the found extent should start | 1565 | * @start - the offset at/after which the found extent should start |
@@ -1578,12 +1599,52 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, | |||
1578 | goto out; | 1599 | goto out; |
1579 | } | 1600 | } |
1580 | } | 1601 | } |
1602 | /* | ||
1603 | * At this point 'node' either contains 'start' or start is | ||
1604 | * before 'node' | ||
1605 | */ | ||
1581 | state = rb_entry(node, struct extent_state, rb_node); | 1606 | state = rb_entry(node, struct extent_state, rb_node); |
1582 | if (in_range(start, state->start, state->end - state->start + 1) && | 1607 | |
1583 | (state->state & bits)) { | 1608 | if (in_range(start, state->start, state->end - state->start + 1)) { |
1584 | start = state->end + 1; | 1609 | if (state->state & bits) { |
1610 | /* | ||
1611 | * |--range with bits sets--| | ||
1612 | * | | ||
1613 | * start | ||
1614 | */ | ||
1615 | start = state->end + 1; | ||
1616 | } else { | ||
1617 | /* | ||
1618 | * 'start' falls within a range that doesn't | ||
1619 | * have the bits set, so take its start as | ||
1620 | * the beginning of the desired range | ||
1621 | * | ||
1622 | * |--range with bits cleared----| | ||
1623 | * | | ||
1624 | * start | ||
1625 | */ | ||
1626 | *start_ret = state->start; | ||
1627 | break; | ||
1628 | } | ||
1585 | } else { | 1629 | } else { |
1586 | *start_ret = start; | 1630 | /* |
1631 | * |---prev range---|---hole/unset---|---node range---| | ||
1632 | * | | ||
1633 | * start | ||
1634 | * | ||
1635 | * or | ||
1636 | * | ||
1637 | * |---hole/unset--||--first node--| | ||
1638 | * 0 | | ||
1639 | * start | ||
1640 | */ | ||
1641 | if (prev) { | ||
1642 | state = rb_entry(prev, struct extent_state, | ||
1643 | rb_node); | ||
1644 | *start_ret = state->end + 1; | ||
1645 | } else { | ||
1646 | *start_ret = 0; | ||
1647 | } | ||
1587 | break; | 1648 | break; |
1588 | } | 1649 | } |
1589 | } | 1650 | } |
@@ -1719,10 +1780,10 @@ static noinline int lock_delalloc_pages(struct inode *inode, | |||
1719 | */ | 1780 | */ |
1720 | EXPORT_FOR_TESTS | 1781 | EXPORT_FOR_TESTS |
1721 | noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, | 1782 | noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, |
1722 | struct extent_io_tree *tree, | ||
1723 | struct page *locked_page, u64 *start, | 1783 | struct page *locked_page, u64 *start, |
1724 | u64 *end) | 1784 | u64 *end) |
1725 | { | 1785 | { |
1786 | struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; | ||
1726 | u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; | 1787 | u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; |
1727 | u64 delalloc_start; | 1788 | u64 delalloc_start; |
1728 | u64 delalloc_end; | 1789 | u64 delalloc_end; |
@@ -2800,12 +2861,11 @@ static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) | |||
2800 | * never fail. We're returning a bio right now but you can call btrfs_io_bio | 2861 | * never fail. We're returning a bio right now but you can call btrfs_io_bio |
2801 | * for the appropriate container_of magic | 2862 | * for the appropriate container_of magic |
2802 | */ | 2863 | */ |
2803 | struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) | 2864 | struct bio *btrfs_bio_alloc(u64 first_byte) |
2804 | { | 2865 | { |
2805 | struct bio *bio; | 2866 | struct bio *bio; |
2806 | 2867 | ||
2807 | bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); | 2868 | bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); |
2808 | bio_set_dev(bio, bdev); | ||
2809 | bio->bi_iter.bi_sector = first_byte >> 9; | 2869 | bio->bi_iter.bi_sector = first_byte >> 9; |
2810 | btrfs_io_bio_init(btrfs_io_bio(bio)); | 2870 | btrfs_io_bio_init(btrfs_io_bio(bio)); |
2811 | return bio; | 2871 | return bio; |
@@ -2916,7 +2976,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, | |||
2916 | } | 2976 | } |
2917 | } | 2977 | } |
2918 | 2978 | ||
2919 | bio = btrfs_bio_alloc(bdev, offset); | 2979 | bio = btrfs_bio_alloc(offset); |
2980 | bio_set_dev(bio, bdev); | ||
2920 | bio_add_page(bio, page, page_size, pg_offset); | 2981 | bio_add_page(bio, page, page_size, pg_offset); |
2921 | bio->bi_end_io = end_io_func; | 2982 | bio->bi_end_io = end_io_func; |
2922 | bio->bi_private = tree; | 2983 | bio->bi_private = tree; |
@@ -3204,21 +3265,10 @@ static inline void contiguous_readpages(struct extent_io_tree *tree, | |||
3204 | unsigned long *bio_flags, | 3265 | unsigned long *bio_flags, |
3205 | u64 *prev_em_start) | 3266 | u64 *prev_em_start) |
3206 | { | 3267 | { |
3207 | struct inode *inode; | 3268 | struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); |
3208 | struct btrfs_ordered_extent *ordered; | ||
3209 | int index; | 3269 | int index; |
3210 | 3270 | ||
3211 | inode = pages[0]->mapping->host; | 3271 | btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); |
3212 | while (1) { | ||
3213 | lock_extent(tree, start, end); | ||
3214 | ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, | ||
3215 | end - start + 1); | ||
3216 | if (!ordered) | ||
3217 | break; | ||
3218 | unlock_extent(tree, start, end); | ||
3219 | btrfs_start_ordered_extent(inode, ordered, 1); | ||
3220 | btrfs_put_ordered_extent(ordered); | ||
3221 | } | ||
3222 | 3272 | ||
3223 | for (index = 0; index < nr_pages; index++) { | 3273 | for (index = 0; index < nr_pages; index++) { |
3224 | __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, | 3274 | __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, |
@@ -3234,22 +3284,12 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
3234 | unsigned long *bio_flags, | 3284 | unsigned long *bio_flags, |
3235 | unsigned int read_flags) | 3285 | unsigned int read_flags) |
3236 | { | 3286 | { |
3237 | struct inode *inode = page->mapping->host; | 3287 | struct btrfs_inode *inode = BTRFS_I(page->mapping->host); |
3238 | struct btrfs_ordered_extent *ordered; | ||
3239 | u64 start = page_offset(page); | 3288 | u64 start = page_offset(page); |
3240 | u64 end = start + PAGE_SIZE - 1; | 3289 | u64 end = start + PAGE_SIZE - 1; |
3241 | int ret; | 3290 | int ret; |
3242 | 3291 | ||
3243 | while (1) { | 3292 | btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); |
3244 | lock_extent(tree, start, end); | ||
3245 | ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, | ||
3246 | PAGE_SIZE); | ||
3247 | if (!ordered) | ||
3248 | break; | ||
3249 | unlock_extent(tree, start, end); | ||
3250 | btrfs_start_ordered_extent(inode, ordered, 1); | ||
3251 | btrfs_put_ordered_extent(ordered); | ||
3252 | } | ||
3253 | 3293 | ||
3254 | ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, | 3294 | ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, |
3255 | bio_flags, read_flags, NULL); | 3295 | bio_flags, read_flags, NULL); |
@@ -3290,7 +3330,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, | |||
3290 | struct page *page, struct writeback_control *wbc, | 3330 | struct page *page, struct writeback_control *wbc, |
3291 | u64 delalloc_start, unsigned long *nr_written) | 3331 | u64 delalloc_start, unsigned long *nr_written) |
3292 | { | 3332 | { |
3293 | struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; | ||
3294 | u64 page_end = delalloc_start + PAGE_SIZE - 1; | 3333 | u64 page_end = delalloc_start + PAGE_SIZE - 1; |
3295 | bool found; | 3334 | bool found; |
3296 | u64 delalloc_to_write = 0; | 3335 | u64 delalloc_to_write = 0; |
@@ -3300,8 +3339,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, | |||
3300 | 3339 | ||
3301 | 3340 | ||
3302 | while (delalloc_end < page_end) { | 3341 | while (delalloc_end < page_end) { |
3303 | found = find_lock_delalloc_range(inode, tree, | 3342 | found = find_lock_delalloc_range(inode, page, |
3304 | page, | ||
3305 | &delalloc_start, | 3343 | &delalloc_start, |
3306 | &delalloc_end); | 3344 | &delalloc_end); |
3307 | if (!found) { | 3345 | if (!found) { |
@@ -3310,7 +3348,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, | |||
3310 | } | 3348 | } |
3311 | ret = btrfs_run_delalloc_range(inode, page, delalloc_start, | 3349 | ret = btrfs_run_delalloc_range(inode, page, delalloc_start, |
3312 | delalloc_end, &page_started, nr_written, wbc); | 3350 | delalloc_end, &page_started, nr_written, wbc); |
3313 | /* File system has been set read-only */ | ||
3314 | if (ret) { | 3351 | if (ret) { |
3315 | SetPageError(page); | 3352 | SetPageError(page); |
3316 | /* | 3353 | /* |
@@ -4542,6 +4579,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | |||
4542 | struct btrfs_path *path; | 4579 | struct btrfs_path *path; |
4543 | struct btrfs_root *root = BTRFS_I(inode)->root; | 4580 | struct btrfs_root *root = BTRFS_I(inode)->root; |
4544 | struct fiemap_cache cache = { 0 }; | 4581 | struct fiemap_cache cache = { 0 }; |
4582 | struct ulist *roots; | ||
4583 | struct ulist *tmp_ulist; | ||
4545 | int end = 0; | 4584 | int end = 0; |
4546 | u64 em_start = 0; | 4585 | u64 em_start = 0; |
4547 | u64 em_len = 0; | 4586 | u64 em_len = 0; |
@@ -4555,6 +4594,13 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | |||
4555 | return -ENOMEM; | 4594 | return -ENOMEM; |
4556 | path->leave_spinning = 1; | 4595 | path->leave_spinning = 1; |
4557 | 4596 | ||
4597 | roots = ulist_alloc(GFP_KERNEL); | ||
4598 | tmp_ulist = ulist_alloc(GFP_KERNEL); | ||
4599 | if (!roots || !tmp_ulist) { | ||
4600 | ret = -ENOMEM; | ||
4601 | goto out_free_ulist; | ||
4602 | } | ||
4603 | |||
4558 | start = round_down(start, btrfs_inode_sectorsize(inode)); | 4604 | start = round_down(start, btrfs_inode_sectorsize(inode)); |
4559 | len = round_up(max, btrfs_inode_sectorsize(inode)) - start; | 4605 | len = round_up(max, btrfs_inode_sectorsize(inode)) - start; |
4560 | 4606 | ||
@@ -4565,8 +4611,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | |||
4565 | ret = btrfs_lookup_file_extent(NULL, root, path, | 4611 | ret = btrfs_lookup_file_extent(NULL, root, path, |
4566 | btrfs_ino(BTRFS_I(inode)), -1, 0); | 4612 | btrfs_ino(BTRFS_I(inode)), -1, 0); |
4567 | if (ret < 0) { | 4613 | if (ret < 0) { |
4568 | btrfs_free_path(path); | 4614 | goto out_free_ulist; |
4569 | return ret; | ||
4570 | } else { | 4615 | } else { |
4571 | WARN_ON(!ret); | 4616 | WARN_ON(!ret); |
4572 | if (ret == 1) | 4617 | if (ret == 1) |
@@ -4675,7 +4720,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | |||
4675 | */ | 4720 | */ |
4676 | ret = btrfs_check_shared(root, | 4721 | ret = btrfs_check_shared(root, |
4677 | btrfs_ino(BTRFS_I(inode)), | 4722 | btrfs_ino(BTRFS_I(inode)), |
4678 | bytenr); | 4723 | bytenr, roots, tmp_ulist); |
4679 | if (ret < 0) | 4724 | if (ret < 0) |
4680 | goto out_free; | 4725 | goto out_free; |
4681 | if (ret) | 4726 | if (ret) |
@@ -4718,9 +4763,13 @@ out_free: | |||
4718 | ret = emit_last_fiemap_cache(fieinfo, &cache); | 4763 | ret = emit_last_fiemap_cache(fieinfo, &cache); |
4719 | free_extent_map(em); | 4764 | free_extent_map(em); |
4720 | out: | 4765 | out: |
4721 | btrfs_free_path(path); | ||
4722 | unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, | 4766 | unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, |
4723 | &cached_state); | 4767 | &cached_state); |
4768 | |||
4769 | out_free_ulist: | ||
4770 | btrfs_free_path(path); | ||
4771 | ulist_free(roots); | ||
4772 | ulist_free(tmp_ulist); | ||
4724 | return ret; | 4773 | return ret; |
4725 | } | 4774 | } |
4726 | 4775 | ||
@@ -4808,7 +4857,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, | |||
4808 | eb->bflags = 0; | 4857 | eb->bflags = 0; |
4809 | rwlock_init(&eb->lock); | 4858 | rwlock_init(&eb->lock); |
4810 | atomic_set(&eb->blocking_readers, 0); | 4859 | atomic_set(&eb->blocking_readers, 0); |
4811 | atomic_set(&eb->blocking_writers, 0); | 4860 | eb->blocking_writers = 0; |
4812 | eb->lock_nested = false; | 4861 | eb->lock_nested = false; |
4813 | init_waitqueue_head(&eb->write_lock_wq); | 4862 | init_waitqueue_head(&eb->write_lock_wq); |
4814 | init_waitqueue_head(&eb->read_lock_wq); | 4863 | init_waitqueue_head(&eb->read_lock_wq); |
@@ -4827,10 +4876,10 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, | |||
4827 | BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); | 4876 | BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); |
4828 | 4877 | ||
4829 | #ifdef CONFIG_BTRFS_DEBUG | 4878 | #ifdef CONFIG_BTRFS_DEBUG |
4830 | atomic_set(&eb->spinning_writers, 0); | 4879 | eb->spinning_writers = 0; |
4831 | atomic_set(&eb->spinning_readers, 0); | 4880 | atomic_set(&eb->spinning_readers, 0); |
4832 | atomic_set(&eb->read_locks, 0); | 4881 | atomic_set(&eb->read_locks, 0); |
4833 | atomic_set(&eb->write_locks, 0); | 4882 | eb->write_locks = 0; |
4834 | #endif | 4883 | #endif |
4835 | 4884 | ||
4836 | return eb; | 4885 | return eb; |
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index aa18a16a6ed7..401423b16976 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h | |||
@@ -167,7 +167,7 @@ struct extent_buffer { | |||
167 | struct rcu_head rcu_head; | 167 | struct rcu_head rcu_head; |
168 | pid_t lock_owner; | 168 | pid_t lock_owner; |
169 | 169 | ||
170 | atomic_t blocking_writers; | 170 | int blocking_writers; |
171 | atomic_t blocking_readers; | 171 | atomic_t blocking_readers; |
172 | bool lock_nested; | 172 | bool lock_nested; |
173 | /* >= 0 if eb belongs to a log tree, -1 otherwise */ | 173 | /* >= 0 if eb belongs to a log tree, -1 otherwise */ |
@@ -187,10 +187,10 @@ struct extent_buffer { | |||
187 | wait_queue_head_t read_lock_wq; | 187 | wait_queue_head_t read_lock_wq; |
188 | struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; | 188 | struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; |
189 | #ifdef CONFIG_BTRFS_DEBUG | 189 | #ifdef CONFIG_BTRFS_DEBUG |
190 | atomic_t spinning_writers; | 190 | int spinning_writers; |
191 | atomic_t spinning_readers; | 191 | atomic_t spinning_readers; |
192 | atomic_t read_locks; | 192 | atomic_t read_locks; |
193 | atomic_t write_locks; | 193 | int write_locks; |
194 | struct list_head leak_list; | 194 | struct list_head leak_list; |
195 | #endif | 195 | #endif |
196 | }; | 196 | }; |
@@ -497,7 +497,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, | |||
497 | u64 delalloc_end, struct page *locked_page, | 497 | u64 delalloc_end, struct page *locked_page, |
498 | unsigned bits_to_clear, | 498 | unsigned bits_to_clear, |
499 | unsigned long page_ops); | 499 | unsigned long page_ops); |
500 | struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte); | 500 | struct bio *btrfs_bio_alloc(u64 first_byte); |
501 | struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); | 501 | struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); |
502 | struct bio *btrfs_bio_clone(struct bio *bio); | 502 | struct bio *btrfs_bio_clone(struct bio *bio); |
503 | struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); | 503 | struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); |
@@ -549,7 +549,7 @@ int free_io_failure(struct extent_io_tree *failure_tree, | |||
549 | struct extent_io_tree *io_tree, | 549 | struct extent_io_tree *io_tree, |
550 | struct io_failure_record *rec); | 550 | struct io_failure_record *rec); |
551 | #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS | 551 | #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS |
552 | bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, | 552 | bool find_lock_delalloc_range(struct inode *inode, |
553 | struct page *locked_page, u64 *start, | 553 | struct page *locked_page, u64 *start, |
554 | u64 *end); | 554 | u64 *end); |
555 | #endif | 555 | #endif |
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d431ea8198e4..1a599f50837b 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/pagemap.h> | 8 | #include <linux/pagemap.h> |
9 | #include <linux/highmem.h> | 9 | #include <linux/highmem.h> |
10 | #include <linux/sched/mm.h> | 10 | #include <linux/sched/mm.h> |
11 | #include <crypto/hash.h> | ||
11 | #include "ctree.h" | 12 | #include "ctree.h" |
12 | #include "disk-io.h" | 13 | #include "disk-io.h" |
13 | #include "transaction.h" | 14 | #include "transaction.h" |
@@ -22,9 +23,13 @@ | |||
22 | #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ | 23 | #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ |
23 | PAGE_SIZE)) | 24 | PAGE_SIZE)) |
24 | 25 | ||
25 | #define MAX_ORDERED_SUM_BYTES(fs_info) ((PAGE_SIZE - \ | 26 | static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, |
26 | sizeof(struct btrfs_ordered_sum)) / \ | 27 | u16 csum_size) |
27 | sizeof(u32) * (fs_info)->sectorsize) | 28 | { |
29 | u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size; | ||
30 | |||
31 | return ncsums * fs_info->sectorsize; | ||
32 | } | ||
28 | 33 | ||
29 | int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, | 34 | int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, |
30 | struct btrfs_root *root, | 35 | struct btrfs_root *root, |
@@ -144,7 +149,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, | |||
144 | } | 149 | } |
145 | 150 | ||
146 | static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, | 151 | static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, |
147 | u64 logical_offset, u32 *dst, int dio) | 152 | u64 logical_offset, u8 *dst, int dio) |
148 | { | 153 | { |
149 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); | 154 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); |
150 | struct bio_vec bvec; | 155 | struct bio_vec bvec; |
@@ -182,7 +187,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio | |||
182 | } | 187 | } |
183 | csum = btrfs_bio->csum; | 188 | csum = btrfs_bio->csum; |
184 | } else { | 189 | } else { |
185 | csum = (u8 *)dst; | 190 | csum = dst; |
186 | } | 191 | } |
187 | 192 | ||
188 | if (bio->bi_iter.bi_size > PAGE_SIZE * 8) | 193 | if (bio->bi_iter.bi_size > PAGE_SIZE * 8) |
@@ -211,7 +216,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio | |||
211 | if (!dio) | 216 | if (!dio) |
212 | offset = page_offset(bvec.bv_page) + bvec.bv_offset; | 217 | offset = page_offset(bvec.bv_page) + bvec.bv_offset; |
213 | count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, | 218 | count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, |
214 | (u32 *)csum, nblocks); | 219 | csum, nblocks); |
215 | if (count) | 220 | if (count) |
216 | goto found; | 221 | goto found; |
217 | 222 | ||
@@ -283,7 +288,8 @@ next: | |||
283 | return 0; | 288 | return 0; |
284 | } | 289 | } |
285 | 290 | ||
286 | blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) | 291 | blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, |
292 | u8 *dst) | ||
287 | { | 293 | { |
288 | return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); | 294 | return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); |
289 | } | 295 | } |
@@ -374,7 +380,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, | |||
374 | struct btrfs_csum_item); | 380 | struct btrfs_csum_item); |
375 | while (start < csum_end) { | 381 | while (start < csum_end) { |
376 | size = min_t(size_t, csum_end - start, | 382 | size = min_t(size_t, csum_end - start, |
377 | MAX_ORDERED_SUM_BYTES(fs_info)); | 383 | max_ordered_sum_bytes(fs_info, csum_size)); |
378 | sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), | 384 | sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), |
379 | GFP_NOFS); | 385 | GFP_NOFS); |
380 | if (!sums) { | 386 | if (!sums) { |
@@ -427,6 +433,7 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, | |||
427 | u64 file_start, int contig) | 433 | u64 file_start, int contig) |
428 | { | 434 | { |
429 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); | 435 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); |
436 | SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); | ||
430 | struct btrfs_ordered_sum *sums; | 437 | struct btrfs_ordered_sum *sums; |
431 | struct btrfs_ordered_extent *ordered = NULL; | 438 | struct btrfs_ordered_extent *ordered = NULL; |
432 | char *data; | 439 | char *data; |
@@ -439,6 +446,7 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, | |||
439 | int i; | 446 | int i; |
440 | u64 offset; | 447 | u64 offset; |
441 | unsigned nofs_flag; | 448 | unsigned nofs_flag; |
449 | const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); | ||
442 | 450 | ||
443 | nofs_flag = memalloc_nofs_save(); | 451 | nofs_flag = memalloc_nofs_save(); |
444 | sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), | 452 | sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), |
@@ -459,6 +467,8 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, | |||
459 | sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; | 467 | sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; |
460 | index = 0; | 468 | index = 0; |
461 | 469 | ||
470 | shash->tfm = fs_info->csum_shash; | ||
471 | |||
462 | bio_for_each_segment(bvec, bio, iter) { | 472 | bio_for_each_segment(bvec, bio, iter) { |
463 | if (!contig) | 473 | if (!contig) |
464 | offset = page_offset(bvec.bv_page) + bvec.bv_offset; | 474 | offset = page_offset(bvec.bv_page) + bvec.bv_offset; |
@@ -498,17 +508,14 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, | |||
498 | index = 0; | 508 | index = 0; |
499 | } | 509 | } |
500 | 510 | ||
501 | sums->sums[index] = ~(u32)0; | 511 | crypto_shash_init(shash); |
502 | data = kmap_atomic(bvec.bv_page); | 512 | data = kmap_atomic(bvec.bv_page); |
503 | sums->sums[index] | 513 | crypto_shash_update(shash, data + bvec.bv_offset |
504 | = btrfs_csum_data(data + bvec.bv_offset | 514 | + (i * fs_info->sectorsize), |
505 | + (i * fs_info->sectorsize), | 515 | fs_info->sectorsize); |
506 | sums->sums[index], | ||
507 | fs_info->sectorsize); | ||
508 | kunmap_atomic(data); | 516 | kunmap_atomic(data); |
509 | btrfs_csum_final(sums->sums[index], | 517 | crypto_shash_final(shash, (char *)(sums->sums + index)); |
510 | (char *)(sums->sums + index)); | 518 | index += csum_size; |
511 | index++; | ||
512 | offset += fs_info->sectorsize; | 519 | offset += fs_info->sectorsize; |
513 | this_sum_bytes += fs_info->sectorsize; | 520 | this_sum_bytes += fs_info->sectorsize; |
514 | total_bytes += fs_info->sectorsize; | 521 | total_bytes += fs_info->sectorsize; |
@@ -904,9 +911,9 @@ found: | |||
904 | write_extent_buffer(leaf, sums->sums + index, (unsigned long)item, | 911 | write_extent_buffer(leaf, sums->sums + index, (unsigned long)item, |
905 | ins_size); | 912 | ins_size); |
906 | 913 | ||
914 | index += ins_size; | ||
907 | ins_size /= csum_size; | 915 | ins_size /= csum_size; |
908 | total_bytes += ins_size * fs_info->sectorsize; | 916 | total_bytes += ins_size * fs_info->sectorsize; |
909 | index += ins_size; | ||
910 | 917 | ||
911 | btrfs_mark_buffer_dirty(path->nodes[0]); | 918 | btrfs_mark_buffer_dirty(path->nodes[0]); |
912 | if (total_bytes < sums->len) { | 919 | if (total_bytes < sums->len) { |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 89f5be2bfb43..58a18ed11546 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include "volumes.h" | 26 | #include "volumes.h" |
27 | #include "qgroup.h" | 27 | #include "qgroup.h" |
28 | #include "compression.h" | 28 | #include "compression.h" |
29 | #include "delalloc-space.h" | ||
29 | 30 | ||
30 | static struct kmem_cache *btrfs_inode_defrag_cachep; | 31 | static struct kmem_cache *btrfs_inode_defrag_cachep; |
31 | /* | 32 | /* |
@@ -1550,30 +1551,20 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, | |||
1550 | { | 1551 | { |
1551 | struct btrfs_fs_info *fs_info = inode->root->fs_info; | 1552 | struct btrfs_fs_info *fs_info = inode->root->fs_info; |
1552 | struct btrfs_root *root = inode->root; | 1553 | struct btrfs_root *root = inode->root; |
1553 | struct btrfs_ordered_extent *ordered; | ||
1554 | u64 lockstart, lockend; | 1554 | u64 lockstart, lockend; |
1555 | u64 num_bytes; | 1555 | u64 num_bytes; |
1556 | int ret; | 1556 | int ret; |
1557 | 1557 | ||
1558 | ret = btrfs_start_write_no_snapshotting(root); | 1558 | ret = btrfs_start_write_no_snapshotting(root); |
1559 | if (!ret) | 1559 | if (!ret) |
1560 | return -ENOSPC; | 1560 | return -EAGAIN; |
1561 | 1561 | ||
1562 | lockstart = round_down(pos, fs_info->sectorsize); | 1562 | lockstart = round_down(pos, fs_info->sectorsize); |
1563 | lockend = round_up(pos + *write_bytes, | 1563 | lockend = round_up(pos + *write_bytes, |
1564 | fs_info->sectorsize) - 1; | 1564 | fs_info->sectorsize) - 1; |
1565 | 1565 | ||
1566 | while (1) { | 1566 | btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart, |
1567 | lock_extent(&inode->io_tree, lockstart, lockend); | 1567 | lockend, NULL); |
1568 | ordered = btrfs_lookup_ordered_range(inode, lockstart, | ||
1569 | lockend - lockstart + 1); | ||
1570 | if (!ordered) { | ||
1571 | break; | ||
1572 | } | ||
1573 | unlock_extent(&inode->io_tree, lockstart, lockend); | ||
1574 | btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); | ||
1575 | btrfs_put_ordered_extent(ordered); | ||
1576 | } | ||
1577 | 1568 | ||
1578 | num_bytes = lockend - lockstart + 1; | 1569 | num_bytes = lockend - lockstart + 1; |
1579 | ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, | 1570 | ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, |
@@ -2721,6 +2712,11 @@ out_only_mutex: | |||
2721 | * for detecting, at fsync time, if the inode isn't yet in the | 2712 | * for detecting, at fsync time, if the inode isn't yet in the |
2722 | * log tree or it's there but not up to date. | 2713 | * log tree or it's there but not up to date. |
2723 | */ | 2714 | */ |
2715 | struct timespec64 now = current_time(inode); | ||
2716 | |||
2717 | inode_inc_iversion(inode); | ||
2718 | inode->i_mtime = now; | ||
2719 | inode->i_ctime = now; | ||
2724 | trans = btrfs_start_transaction(root, 1); | 2720 | trans = btrfs_start_transaction(root, 1); |
2725 | if (IS_ERR(trans)) { | 2721 | if (IS_ERR(trans)) { |
2726 | err = PTR_ERR(trans); | 2722 | err = PTR_ERR(trans); |
@@ -2801,9 +2797,9 @@ static int btrfs_fallocate_update_isize(struct inode *inode, | |||
2801 | } | 2797 | } |
2802 | 2798 | ||
2803 | enum { | 2799 | enum { |
2804 | RANGE_BOUNDARY_WRITTEN_EXTENT = 0, | 2800 | RANGE_BOUNDARY_WRITTEN_EXTENT, |
2805 | RANGE_BOUNDARY_PREALLOC_EXTENT = 1, | 2801 | RANGE_BOUNDARY_PREALLOC_EXTENT, |
2806 | RANGE_BOUNDARY_HOLE = 2, | 2802 | RANGE_BOUNDARY_HOLE, |
2807 | }; | 2803 | }; |
2808 | 2804 | ||
2809 | static int btrfs_zero_range_check_range_boundary(struct inode *inode, | 2805 | static int btrfs_zero_range_check_range_boundary(struct inode *inode, |
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f74dc259307b..062be9dde4c6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
@@ -18,6 +18,8 @@ | |||
18 | #include "extent_io.h" | 18 | #include "extent_io.h" |
19 | #include "inode-map.h" | 19 | #include "inode-map.h" |
20 | #include "volumes.h" | 20 | #include "volumes.h" |
21 | #include "space-info.h" | ||
22 | #include "delalloc-space.h" | ||
21 | 23 | ||
22 | #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) | 24 | #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) |
23 | #define MAX_CACHE_BYTES_PER_GIG SZ_32K | 25 | #define MAX_CACHE_BYTES_PER_GIG SZ_32K |
@@ -465,9 +467,8 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) | |||
465 | if (index == 0) | 467 | if (index == 0) |
466 | offset = sizeof(u32) * io_ctl->num_pages; | 468 | offset = sizeof(u32) * io_ctl->num_pages; |
467 | 469 | ||
468 | crc = btrfs_csum_data(io_ctl->orig + offset, crc, | 470 | crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); |
469 | PAGE_SIZE - offset); | 471 | btrfs_crc32c_final(crc, (u8 *)&crc); |
470 | btrfs_csum_final(crc, (u8 *)&crc); | ||
471 | io_ctl_unmap_page(io_ctl); | 472 | io_ctl_unmap_page(io_ctl); |
472 | tmp = page_address(io_ctl->pages[0]); | 473 | tmp = page_address(io_ctl->pages[0]); |
473 | tmp += index; | 474 | tmp += index; |
@@ -493,9 +494,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) | |||
493 | val = *tmp; | 494 | val = *tmp; |
494 | 495 | ||
495 | io_ctl_map_page(io_ctl, 0); | 496 | io_ctl_map_page(io_ctl, 0); |
496 | crc = btrfs_csum_data(io_ctl->orig + offset, crc, | 497 | crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); |
497 | PAGE_SIZE - offset); | 498 | btrfs_crc32c_final(crc, (u8 *)&crc); |
498 | btrfs_csum_final(crc, (u8 *)&crc); | ||
499 | if (val != crc) { | 499 | if (val != crc) { |
500 | btrfs_err_rl(io_ctl->fs_info, | 500 | btrfs_err_rl(io_ctl->fs_info, |
501 | "csum mismatch on free space cache"); | 501 | "csum mismatch on free space cache"); |
@@ -3166,8 +3166,8 @@ static int do_trimming(struct btrfs_block_group_cache *block_group, | |||
3166 | space_info->bytes_readonly += reserved_bytes; | 3166 | space_info->bytes_readonly += reserved_bytes; |
3167 | block_group->reserved -= reserved_bytes; | 3167 | block_group->reserved -= reserved_bytes; |
3168 | space_info->bytes_reserved -= reserved_bytes; | 3168 | space_info->bytes_reserved -= reserved_bytes; |
3169 | spin_unlock(&space_info->lock); | ||
3170 | spin_unlock(&block_group->lock); | 3169 | spin_unlock(&block_group->lock); |
3170 | spin_unlock(&space_info->lock); | ||
3171 | } | 3171 | } |
3172 | 3172 | ||
3173 | return ret; | 3173 | return ret; |
@@ -3358,7 +3358,7 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) | |||
3358 | 3358 | ||
3359 | if (cleanup) { | 3359 | if (cleanup) { |
3360 | mutex_lock(&fs_info->chunk_mutex); | 3360 | mutex_lock(&fs_info->chunk_mutex); |
3361 | em_tree = &fs_info->mapping_tree.map_tree; | 3361 | em_tree = &fs_info->mapping_tree; |
3362 | write_lock(&em_tree->lock); | 3362 | write_lock(&em_tree->lock); |
3363 | em = lookup_extent_mapping(em_tree, block_group->key.objectid, | 3363 | em = lookup_extent_mapping(em_tree, block_group->key.objectid, |
3364 | 1); | 3364 | 1); |
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index ffca2abf13d0..2e8bb402050b 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include "free-space-cache.h" | 11 | #include "free-space-cache.h" |
12 | #include "inode-map.h" | 12 | #include "inode-map.h" |
13 | #include "transaction.h" | 13 | #include "transaction.h" |
14 | #include "delalloc-space.h" | ||
14 | 15 | ||
15 | static int caching_kthread(void *data) | 16 | static int caching_kthread(void *data) |
16 | { | 17 | { |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a2aabdb85226..1af069a9a0c7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include "props.h" | 47 | #include "props.h" |
48 | #include "qgroup.h" | 48 | #include "qgroup.h" |
49 | #include "dedupe.h" | 49 | #include "dedupe.h" |
50 | #include "delalloc-space.h" | ||
50 | 51 | ||
51 | struct btrfs_iget_args { | 52 | struct btrfs_iget_args { |
52 | struct btrfs_key *location; | 53 | struct btrfs_key *location; |
@@ -1932,17 +1933,19 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, | |||
1932 | u64 length = 0; | 1933 | u64 length = 0; |
1933 | u64 map_length; | 1934 | u64 map_length; |
1934 | int ret; | 1935 | int ret; |
1936 | struct btrfs_io_geometry geom; | ||
1935 | 1937 | ||
1936 | if (bio_flags & EXTENT_BIO_COMPRESSED) | 1938 | if (bio_flags & EXTENT_BIO_COMPRESSED) |
1937 | return 0; | 1939 | return 0; |
1938 | 1940 | ||
1939 | length = bio->bi_iter.bi_size; | 1941 | length = bio->bi_iter.bi_size; |
1940 | map_length = length; | 1942 | map_length = length; |
1941 | ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, | 1943 | ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length, |
1942 | NULL, 0); | 1944 | &geom); |
1943 | if (ret < 0) | 1945 | if (ret < 0) |
1944 | return ret; | 1946 | return ret; |
1945 | if (map_length < length + size) | 1947 | |
1948 | if (geom.len < length + size) | ||
1946 | return 1; | 1949 | return 1; |
1947 | return 0; | 1950 | return 0; |
1948 | } | 1951 | } |
@@ -3203,16 +3206,23 @@ static int __readpage_endio_check(struct inode *inode, | |||
3203 | int icsum, struct page *page, | 3206 | int icsum, struct page *page, |
3204 | int pgoff, u64 start, size_t len) | 3207 | int pgoff, u64 start, size_t len) |
3205 | { | 3208 | { |
3209 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); | ||
3210 | SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); | ||
3206 | char *kaddr; | 3211 | char *kaddr; |
3207 | u32 csum_expected; | 3212 | u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); |
3208 | u32 csum = ~(u32)0; | 3213 | u8 *csum_expected; |
3214 | u8 csum[BTRFS_CSUM_SIZE]; | ||
3209 | 3215 | ||
3210 | csum_expected = *(((u32 *)io_bio->csum) + icsum); | 3216 | csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size; |
3211 | 3217 | ||
3212 | kaddr = kmap_atomic(page); | 3218 | kaddr = kmap_atomic(page); |
3213 | csum = btrfs_csum_data(kaddr + pgoff, csum, len); | 3219 | shash->tfm = fs_info->csum_shash; |
3214 | btrfs_csum_final(csum, (u8 *)&csum); | 3220 | |
3215 | if (csum != csum_expected) | 3221 | crypto_shash_init(shash); |
3222 | crypto_shash_update(shash, kaddr + pgoff, len); | ||
3223 | crypto_shash_final(shash, csum); | ||
3224 | |||
3225 | if (memcmp(csum, csum_expected, csum_size)) | ||
3216 | goto zeroit; | 3226 | goto zeroit; |
3217 | 3227 | ||
3218 | kunmap_atomic(kaddr); | 3228 | kunmap_atomic(kaddr); |
@@ -3286,6 +3296,28 @@ void btrfs_add_delayed_iput(struct inode *inode) | |||
3286 | wake_up_process(fs_info->cleaner_kthread); | 3296 | wake_up_process(fs_info->cleaner_kthread); |
3287 | } | 3297 | } |
3288 | 3298 | ||
3299 | static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, | ||
3300 | struct btrfs_inode *inode) | ||
3301 | { | ||
3302 | list_del_init(&inode->delayed_iput); | ||
3303 | spin_unlock(&fs_info->delayed_iput_lock); | ||
3304 | iput(&inode->vfs_inode); | ||
3305 | if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) | ||
3306 | wake_up(&fs_info->delayed_iputs_wait); | ||
3307 | spin_lock(&fs_info->delayed_iput_lock); | ||
3308 | } | ||
3309 | |||
3310 | static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, | ||
3311 | struct btrfs_inode *inode) | ||
3312 | { | ||
3313 | if (!list_empty(&inode->delayed_iput)) { | ||
3314 | spin_lock(&fs_info->delayed_iput_lock); | ||
3315 | if (!list_empty(&inode->delayed_iput)) | ||
3316 | run_delayed_iput_locked(fs_info, inode); | ||
3317 | spin_unlock(&fs_info->delayed_iput_lock); | ||
3318 | } | ||
3319 | } | ||
3320 | |||
3289 | void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) | 3321 | void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) |
3290 | { | 3322 | { |
3291 | 3323 | ||
@@ -3295,12 +3327,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) | |||
3295 | 3327 | ||
3296 | inode = list_first_entry(&fs_info->delayed_iputs, | 3328 | inode = list_first_entry(&fs_info->delayed_iputs, |
3297 | struct btrfs_inode, delayed_iput); | 3329 | struct btrfs_inode, delayed_iput); |
3298 | list_del_init(&inode->delayed_iput); | 3330 | run_delayed_iput_locked(fs_info, inode); |
3299 | spin_unlock(&fs_info->delayed_iput_lock); | ||
3300 | iput(&inode->vfs_inode); | ||
3301 | if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) | ||
3302 | wake_up(&fs_info->delayed_iputs_wait); | ||
3303 | spin_lock(&fs_info->delayed_iput_lock); | ||
3304 | } | 3331 | } |
3305 | spin_unlock(&fs_info->delayed_iput_lock); | 3332 | spin_unlock(&fs_info->delayed_iput_lock); |
3306 | } | 3333 | } |
@@ -3935,9 +3962,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, | |||
3935 | struct btrfs_fs_info *fs_info = root->fs_info; | 3962 | struct btrfs_fs_info *fs_info = root->fs_info; |
3936 | struct btrfs_path *path; | 3963 | struct btrfs_path *path; |
3937 | int ret = 0; | 3964 | int ret = 0; |
3938 | struct extent_buffer *leaf; | ||
3939 | struct btrfs_dir_item *di; | 3965 | struct btrfs_dir_item *di; |
3940 | struct btrfs_key key; | ||
3941 | u64 index; | 3966 | u64 index; |
3942 | u64 ino = btrfs_ino(inode); | 3967 | u64 ino = btrfs_ino(inode); |
3943 | u64 dir_ino = btrfs_ino(dir); | 3968 | u64 dir_ino = btrfs_ino(dir); |
@@ -3955,8 +3980,6 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, | |||
3955 | ret = di ? PTR_ERR(di) : -ENOENT; | 3980 | ret = di ? PTR_ERR(di) : -ENOENT; |
3956 | goto err; | 3981 | goto err; |
3957 | } | 3982 | } |
3958 | leaf = path->nodes[0]; | ||
3959 | btrfs_dir_item_key_to_cpu(leaf, di, &key); | ||
3960 | ret = btrfs_delete_one_dir_name(trans, root, path, di); | 3983 | ret = btrfs_delete_one_dir_name(trans, root, path, di); |
3961 | if (ret) | 3984 | if (ret) |
3962 | goto err; | 3985 | goto err; |
@@ -4009,6 +4032,17 @@ skip_backref: | |||
4009 | ret = 0; | 4032 | ret = 0; |
4010 | else if (ret) | 4033 | else if (ret) |
4011 | btrfs_abort_transaction(trans, ret); | 4034 | btrfs_abort_transaction(trans, ret); |
4035 | |||
4036 | /* | ||
4037 | * If we have a pending delayed iput we could end up with the final iput | ||
4038 | * being run in btrfs-cleaner context. If we have enough of these built | ||
4039 | * up we can end up burning a lot of time in btrfs-cleaner without any | ||
4040 | * way to throttle the unlinks. Since we're currently holding a ref on | ||
4041 | * the inode we can run the delayed iput here without any issues as the | ||
4042 | * final iput won't be done until after we drop the ref we're currently | ||
4043 | * holding. | ||
4044 | */ | ||
4045 | btrfs_run_delayed_iput(fs_info, inode); | ||
4012 | err: | 4046 | err: |
4013 | btrfs_free_path(path); | 4047 | btrfs_free_path(path); |
4014 | if (ret) | 4048 | if (ret) |
@@ -5008,21 +5042,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
5008 | if (size <= hole_start) | 5042 | if (size <= hole_start) |
5009 | return 0; | 5043 | return 0; |
5010 | 5044 | ||
5011 | while (1) { | 5045 | btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start, |
5012 | struct btrfs_ordered_extent *ordered; | 5046 | block_end - 1, &cached_state); |
5013 | |||
5014 | lock_extent_bits(io_tree, hole_start, block_end - 1, | ||
5015 | &cached_state); | ||
5016 | ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start, | ||
5017 | block_end - hole_start); | ||
5018 | if (!ordered) | ||
5019 | break; | ||
5020 | unlock_extent_cached(io_tree, hole_start, block_end - 1, | ||
5021 | &cached_state); | ||
5022 | btrfs_start_ordered_extent(inode, ordered, 1); | ||
5023 | btrfs_put_ordered_extent(ordered); | ||
5024 | } | ||
5025 | |||
5026 | cur_offset = hole_start; | 5047 | cur_offset = hole_start; |
5027 | while (1) { | 5048 | while (1) { |
5028 | em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, | 5049 | em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, |
@@ -8318,22 +8339,21 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) | |||
8318 | struct bio *orig_bio = dip->orig_bio; | 8339 | struct bio *orig_bio = dip->orig_bio; |
8319 | u64 start_sector = orig_bio->bi_iter.bi_sector; | 8340 | u64 start_sector = orig_bio->bi_iter.bi_sector; |
8320 | u64 file_offset = dip->logical_offset; | 8341 | u64 file_offset = dip->logical_offset; |
8321 | u64 map_length; | ||
8322 | int async_submit = 0; | 8342 | int async_submit = 0; |
8323 | u64 submit_len; | 8343 | u64 submit_len; |
8324 | int clone_offset = 0; | 8344 | int clone_offset = 0; |
8325 | int clone_len; | 8345 | int clone_len; |
8326 | int ret; | 8346 | int ret; |
8327 | blk_status_t status; | 8347 | blk_status_t status; |
8348 | struct btrfs_io_geometry geom; | ||
8328 | 8349 | ||
8329 | map_length = orig_bio->bi_iter.bi_size; | 8350 | submit_len = orig_bio->bi_iter.bi_size; |
8330 | submit_len = map_length; | 8351 | ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), |
8331 | ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, | 8352 | start_sector << 9, submit_len, &geom); |
8332 | &map_length, NULL, 0); | ||
8333 | if (ret) | 8353 | if (ret) |
8334 | return -EIO; | 8354 | return -EIO; |
8335 | 8355 | ||
8336 | if (map_length >= submit_len) { | 8356 | if (geom.len >= submit_len) { |
8337 | bio = orig_bio; | 8357 | bio = orig_bio; |
8338 | dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; | 8358 | dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; |
8339 | goto submit; | 8359 | goto submit; |
@@ -8346,10 +8366,10 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) | |||
8346 | async_submit = 1; | 8366 | async_submit = 1; |
8347 | 8367 | ||
8348 | /* bio split */ | 8368 | /* bio split */ |
8349 | ASSERT(map_length <= INT_MAX); | 8369 | ASSERT(geom.len <= INT_MAX); |
8350 | atomic_inc(&dip->pending_bios); | 8370 | atomic_inc(&dip->pending_bios); |
8351 | do { | 8371 | do { |
8352 | clone_len = min_t(int, submit_len, map_length); | 8372 | clone_len = min_t(int, submit_len, geom.len); |
8353 | 8373 | ||
8354 | /* | 8374 | /* |
8355 | * This will never fail as it's passing GFP_NOFS and | 8375 |
@@ -8386,9 +8406,8 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) | |||
8386 | start_sector += clone_len >> 9; | 8406 | start_sector += clone_len >> 9; |
8387 | file_offset += clone_len; | 8407 | file_offset += clone_len; |
8388 | 8408 | ||
8389 | map_length = submit_len; | 8409 | ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), |
8390 | ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), | 8410 | start_sector << 9, submit_len, &geom); |
8391 | start_sector << 9, &map_length, NULL, 0); | ||
8392 | if (ret) | 8411 | if (ret) |
8393 | goto out_err; | 8412 | goto out_err; |
8394 | } while (submit_len > 0); | 8413 | } while (submit_len > 0); |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cfeff1b8dce0..818f7ec8bb0e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
@@ -43,6 +43,8 @@ | |||
43 | #include "qgroup.h" | 43 | #include "qgroup.h" |
44 | #include "tree-log.h" | 44 | #include "tree-log.h" |
45 | #include "compression.h" | 45 | #include "compression.h" |
46 | #include "space-info.h" | ||
47 | #include "delalloc-space.h" | ||
46 | 48 | ||
47 | #ifdef CONFIG_64BIT | 49 | #ifdef CONFIG_64BIT |
48 | /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI | 50 | /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI |
@@ -3993,6 +3995,27 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, | |||
3993 | if (!same_inode) | 3995 | if (!same_inode) |
3994 | inode_dio_wait(inode_out); | 3996 | inode_dio_wait(inode_out); |
3995 | 3997 | ||
3998 | /* | ||
3999 | * Workaround to make sure NOCOW buffered writes reach disk as NOCOW. | ||
4000 | * | ||
4001 | * Btrfs' back references do not have block-level granularity; they | ||
4002 | * work at the whole extent level. | ||
4003 | * A NOCOW buffered write without data space reserved may not be able | ||
4004 | * to fall back to CoW due to lack of data space, and thus could cause | ||
4005 | * data loss. | ||
4006 | * | ||
4007 | * Here we take a shortcut by flushing the whole inode, so that all | ||
4008 | * NOCOW writes reach disk as NOCOW before we increase the reference | ||
4009 | * count of the extent. We could do better by only flushing NOCOW | ||
4010 | * data, but that needs extra accounting. | ||
4011 | * | ||
4012 | * Also we don't need to check ASYNC_EXTENT, as async extents will be | ||
4013 | * CoWed anyway, not affecting the NOCOW part. | ||
4014 | */ | ||
4015 | ret = filemap_flush(inode_in->i_mapping); | ||
4016 | if (ret < 0) | ||
4017 | return ret; | ||
4018 | |||
3996 | ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), | 4019 | ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), |
3997 | wb_len); | 4020 | wb_len); |
3998 | if (ret < 0) | 4021 | if (ret < 0) |
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 2f6c3c7851ed..98fccce4208c 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c | |||
@@ -15,19 +15,19 @@ | |||
15 | #ifdef CONFIG_BTRFS_DEBUG | 15 | #ifdef CONFIG_BTRFS_DEBUG |
16 | static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) | 16 | static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) |
17 | { | 17 | { |
18 | WARN_ON(atomic_read(&eb->spinning_writers)); | 18 | WARN_ON(eb->spinning_writers); |
19 | atomic_inc(&eb->spinning_writers); | 19 | eb->spinning_writers++; |
20 | } | 20 | } |
21 | 21 | ||
22 | static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) | 22 | static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) |
23 | { | 23 | { |
24 | WARN_ON(atomic_read(&eb->spinning_writers) != 1); | 24 | WARN_ON(eb->spinning_writers != 1); |
25 | atomic_dec(&eb->spinning_writers); | 25 | eb->spinning_writers--; |
26 | } | 26 | } |
27 | 27 | ||
28 | static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) | 28 | static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) |
29 | { | 29 | { |
30 | WARN_ON(atomic_read(&eb->spinning_writers)); | 30 | WARN_ON(eb->spinning_writers); |
31 | } | 31 | } |
32 | 32 | ||
33 | static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) | 33 | static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) |
@@ -58,17 +58,17 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) | |||
58 | 58 | ||
59 | static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) | 59 | static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) |
60 | { | 60 | { |
61 | atomic_inc(&eb->write_locks); | 61 | eb->write_locks++; |
62 | } | 62 | } |
63 | 63 | ||
64 | static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) | 64 | static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) |
65 | { | 65 | { |
66 | atomic_dec(&eb->write_locks); | 66 | eb->write_locks--; |
67 | } | 67 | } |
68 | 68 | ||
69 | void btrfs_assert_tree_locked(struct extent_buffer *eb) | 69 | void btrfs_assert_tree_locked(struct extent_buffer *eb) |
70 | { | 70 | { |
71 | BUG_ON(!atomic_read(&eb->write_locks)); | 71 | BUG_ON(!eb->write_locks); |
72 | } | 72 | } |
73 | 73 | ||
74 | #else | 74 | #else |
@@ -111,10 +111,10 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) | |||
111 | */ | 111 | */ |
112 | if (eb->lock_nested && current->pid == eb->lock_owner) | 112 | if (eb->lock_nested && current->pid == eb->lock_owner) |
113 | return; | 113 | return; |
114 | if (atomic_read(&eb->blocking_writers) == 0) { | 114 | if (eb->blocking_writers == 0) { |
115 | btrfs_assert_spinning_writers_put(eb); | 115 | btrfs_assert_spinning_writers_put(eb); |
116 | btrfs_assert_tree_locked(eb); | 116 | btrfs_assert_tree_locked(eb); |
117 | atomic_inc(&eb->blocking_writers); | 117 | eb->blocking_writers++; |
118 | write_unlock(&eb->lock); | 118 | write_unlock(&eb->lock); |
119 | } | 119 | } |
120 | } | 120 | } |
@@ -148,12 +148,11 @@ void btrfs_clear_lock_blocking_write(struct extent_buffer *eb) | |||
148 | */ | 148 | */ |
149 | if (eb->lock_nested && current->pid == eb->lock_owner) | 149 | if (eb->lock_nested && current->pid == eb->lock_owner) |
150 | return; | 150 | return; |
151 | BUG_ON(atomic_read(&eb->blocking_writers) != 1); | ||
152 | write_lock(&eb->lock); | 151 | write_lock(&eb->lock); |
152 | BUG_ON(eb->blocking_writers != 1); | ||
153 | btrfs_assert_spinning_writers_get(eb); | 153 | btrfs_assert_spinning_writers_get(eb); |
154 | /* atomic_dec_and_test implies a barrier */ | 154 | if (--eb->blocking_writers == 0) |
155 | if (atomic_dec_and_test(&eb->blocking_writers)) | 155 | cond_wake_up(&eb->write_lock_wq); |
156 | cond_wake_up_nomb(&eb->write_lock_wq); | ||
157 | } | 156 | } |
158 | 157 | ||
159 | /* | 158 | /* |
@@ -167,12 +166,10 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) | |||
167 | if (trace_btrfs_tree_read_lock_enabled()) | 166 | if (trace_btrfs_tree_read_lock_enabled()) |
168 | start_ns = ktime_get_ns(); | 167 | start_ns = ktime_get_ns(); |
169 | again: | 168 | again: |
170 | BUG_ON(!atomic_read(&eb->blocking_writers) && | ||
171 | current->pid == eb->lock_owner); | ||
172 | |||
173 | read_lock(&eb->lock); | 169 | read_lock(&eb->lock); |
174 | if (atomic_read(&eb->blocking_writers) && | 170 | BUG_ON(eb->blocking_writers == 0 && |
175 | current->pid == eb->lock_owner) { | 171 | current->pid == eb->lock_owner); |
172 | if (eb->blocking_writers && current->pid == eb->lock_owner) { | ||
176 | /* | 173 | /* |
177 | * This extent is already write-locked by our thread. We allow | 174 | * This extent is already write-locked by our thread. We allow |
178 | * an additional read lock to be added because it's for the same | 175 | * an additional read lock to be added because it's for the same |
@@ -185,10 +182,10 @@ again: | |||
185 | trace_btrfs_tree_read_lock(eb, start_ns); | 182 | trace_btrfs_tree_read_lock(eb, start_ns); |
186 | return; | 183 | return; |
187 | } | 184 | } |
188 | if (atomic_read(&eb->blocking_writers)) { | 185 | if (eb->blocking_writers) { |
189 | read_unlock(&eb->lock); | 186 | read_unlock(&eb->lock); |
190 | wait_event(eb->write_lock_wq, | 187 | wait_event(eb->write_lock_wq, |
191 | atomic_read(&eb->blocking_writers) == 0); | 188 | eb->blocking_writers == 0); |
192 | goto again; | 189 | goto again; |
193 | } | 190 | } |
194 | btrfs_assert_tree_read_locks_get(eb); | 191 | btrfs_assert_tree_read_locks_get(eb); |
@@ -203,11 +200,11 @@ again: | |||
203 | */ | 200 | */ |
204 | int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) | 201 | int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) |
205 | { | 202 | { |
206 | if (atomic_read(&eb->blocking_writers)) | 203 | if (eb->blocking_writers) |
207 | return 0; | 204 | return 0; |
208 | 205 | ||
209 | read_lock(&eb->lock); | 206 | read_lock(&eb->lock); |
210 | if (atomic_read(&eb->blocking_writers)) { | 207 | if (eb->blocking_writers) { |
211 | read_unlock(&eb->lock); | 208 | read_unlock(&eb->lock); |
212 | return 0; | 209 | return 0; |
213 | } | 210 | } |
@@ -223,13 +220,13 @@ int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) | |||
223 | */ | 220 | */ |
224 | int btrfs_try_tree_read_lock(struct extent_buffer *eb) | 221 | int btrfs_try_tree_read_lock(struct extent_buffer *eb) |
225 | { | 222 | { |
226 | if (atomic_read(&eb->blocking_writers)) | 223 | if (eb->blocking_writers) |
227 | return 0; | 224 | return 0; |
228 | 225 | ||
229 | if (!read_trylock(&eb->lock)) | 226 | if (!read_trylock(&eb->lock)) |
230 | return 0; | 227 | return 0; |
231 | 228 | ||
232 | if (atomic_read(&eb->blocking_writers)) { | 229 | if (eb->blocking_writers) { |
233 | read_unlock(&eb->lock); | 230 | read_unlock(&eb->lock); |
234 | return 0; | 231 | return 0; |
235 | } | 232 | } |
@@ -245,13 +242,11 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) | |||
245 | */ | 242 | */ |
246 | int btrfs_try_tree_write_lock(struct extent_buffer *eb) | 243 | int btrfs_try_tree_write_lock(struct extent_buffer *eb) |
247 | { | 244 | { |
248 | if (atomic_read(&eb->blocking_writers) || | 245 | if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) |
249 | atomic_read(&eb->blocking_readers)) | ||
250 | return 0; | 246 | return 0; |
251 | 247 | ||
252 | write_lock(&eb->lock); | 248 | write_lock(&eb->lock); |
253 | if (atomic_read(&eb->blocking_writers) || | 249 | if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) { |
254 | atomic_read(&eb->blocking_readers)) { | ||
255 | write_unlock(&eb->lock); | 250 | write_unlock(&eb->lock); |
256 | return 0; | 251 | return 0; |
257 | } | 252 | } |
@@ -322,10 +317,9 @@ void btrfs_tree_lock(struct extent_buffer *eb) | |||
322 | WARN_ON(eb->lock_owner == current->pid); | 317 | WARN_ON(eb->lock_owner == current->pid); |
323 | again: | 318 | again: |
324 | wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); | 319 | wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); |
325 | wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); | 320 | wait_event(eb->write_lock_wq, eb->blocking_writers == 0); |
326 | write_lock(&eb->lock); | 321 | write_lock(&eb->lock); |
327 | if (atomic_read(&eb->blocking_readers) || | 322 | if (atomic_read(&eb->blocking_readers) || eb->blocking_writers) { |
328 | atomic_read(&eb->blocking_writers)) { | ||
329 | write_unlock(&eb->lock); | 323 | write_unlock(&eb->lock); |
330 | goto again; | 324 | goto again; |
331 | } | 325 | } |
@@ -340,7 +334,7 @@ again: | |||
340 | */ | 334 | */ |
341 | void btrfs_tree_unlock(struct extent_buffer *eb) | 335 | void btrfs_tree_unlock(struct extent_buffer *eb) |
342 | { | 336 | { |
343 | int blockers = atomic_read(&eb->blocking_writers); | 337 | int blockers = eb->blocking_writers; |
344 | 338 | ||
345 | BUG_ON(blockers > 1); | 339 | BUG_ON(blockers > 1); |
346 | 340 | ||
@@ -351,7 +345,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb) | |||
351 | 345 | ||
352 | if (blockers) { | 346 | if (blockers) { |
353 | btrfs_assert_no_spinning_writers(eb); | 347 | btrfs_assert_no_spinning_writers(eb); |
354 | atomic_dec(&eb->blocking_writers); | 348 | eb->blocking_writers--; |
355 | /* Use the lighter barrier after atomic */ | 349 | /* Use the lighter barrier after atomic */ |
356 | smp_mb__after_atomic(); | 350 | smp_mb__after_atomic(); |
357 | cond_wake_up_nomb(&eb->write_lock_wq); | 351 | cond_wake_up_nomb(&eb->write_lock_wq); |
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 52889da69113..1744ba8b2754 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include "extent_io.h" | 13 | #include "extent_io.h" |
14 | #include "disk-io.h" | 14 | #include "disk-io.h" |
15 | #include "compression.h" | 15 | #include "compression.h" |
16 | #include "delalloc-space.h" | ||
16 | 17 | ||
17 | static struct kmem_cache *btrfs_ordered_extent_cache; | 18 | static struct kmem_cache *btrfs_ordered_extent_cache; |
18 | 19 | ||
@@ -924,14 +925,16 @@ out: | |||
924 | * be reclaimed before their checksum is actually put into the btree | 925 | * be reclaimed before their checksum is actually put into the btree |
925 | */ | 926 | */ |
926 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, | 927 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, |
927 | u32 *sum, int len) | 928 | u8 *sum, int len) |
928 | { | 929 | { |
930 | struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); | ||
929 | struct btrfs_ordered_sum *ordered_sum; | 931 | struct btrfs_ordered_sum *ordered_sum; |
930 | struct btrfs_ordered_extent *ordered; | 932 | struct btrfs_ordered_extent *ordered; |
931 | struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; | 933 | struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; |
932 | unsigned long num_sectors; | 934 | unsigned long num_sectors; |
933 | unsigned long i; | 935 | unsigned long i; |
934 | u32 sectorsize = btrfs_inode_sectorsize(inode); | 936 | u32 sectorsize = btrfs_inode_sectorsize(inode); |
937 | const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); | ||
935 | int index = 0; | 938 | int index = 0; |
936 | 939 | ||
937 | ordered = btrfs_lookup_ordered_extent(inode, offset); | 940 | ordered = btrfs_lookup_ordered_extent(inode, offset); |
@@ -947,10 +950,10 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, | |||
947 | num_sectors = ordered_sum->len >> | 950 | num_sectors = ordered_sum->len >> |
948 | inode->i_sb->s_blocksize_bits; | 951 | inode->i_sb->s_blocksize_bits; |
949 | num_sectors = min_t(int, len - index, num_sectors - i); | 952 | num_sectors = min_t(int, len - index, num_sectors - i); |
950 | memcpy(sum + index, ordered_sum->sums + i, | 953 | memcpy(sum + index, ordered_sum->sums + i * csum_size, |
951 | num_sectors); | 954 | num_sectors * csum_size); |
952 | 955 | ||
953 | index += (int)num_sectors; | 956 | index += (int)num_sectors * csum_size; |
954 | if (index == len) | 957 | if (index == len) |
955 | goto out; | 958 | goto out; |
956 | disk_bytenr += num_sectors * sectorsize; | 959 | disk_bytenr += num_sectors * sectorsize; |
@@ -962,6 +965,51 @@ out: | |||
962 | return index; | 965 | return index; |
963 | } | 966 | } |
964 | 967 | ||
968 | /* | ||
969 | * btrfs_lock_and_flush_ordered_range - Lock the passed range and ensure all pending | ||
970 | * ordered extents in it are run to completion. | ||
971 | * | ||
972 | * @tree: IO tree used for locking out other users of the range | ||
973 | * @inode: Inode whose ordered tree is to be searched | ||
974 | * @start: Beginning of range to flush | ||
975 | * @end: Last byte of range to lock | ||
976 | * @cached_state: If passed, will return the extent state responsible for the | ||
977 | * locked range. It's the caller's responsibility to free the cached state. | ||
978 | * | ||
979 | * This function always returns with the given range locked, ensuring after it's | ||
980 | * called no ordered extent can be pending. | ||
981 | */ | ||
982 | void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, | ||
983 | struct btrfs_inode *inode, u64 start, | ||
984 | u64 end, | ||
985 | struct extent_state **cached_state) | ||
986 | { | ||
987 | struct btrfs_ordered_extent *ordered; | ||
988 | struct extent_state *cachedp = NULL; | ||
989 | |||
990 | if (cached_state) | ||
991 | cachedp = *cached_state; | ||
992 | |||
993 | while (1) { | ||
994 | lock_extent_bits(tree, start, end, &cachedp); | ||
995 | ordered = btrfs_lookup_ordered_range(inode, start, | ||
996 | end - start + 1); | ||
997 | if (!ordered) { | ||
998 | /* | ||
999 | * If no external cached_state has been passed then | ||
1000 | * decrement the extra ref taken for cachedp since we | ||
1001 | * aren't exposing it outside of this function | ||
1002 | */ | ||
1003 | if (!cached_state) | ||
1004 | refcount_dec(&cachedp->refs); | ||
1005 | break; | ||
1006 | } | ||
1007 | unlock_extent_cached(tree, start, end, &cachedp); | ||
1008 | btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); | ||
1009 | btrfs_put_ordered_extent(ordered); | ||
1010 | } | ||
1011 | } | ||
1012 | |||
965 | int __init ordered_data_init(void) | 1013 | int __init ordered_data_init(void) |
966 | { | 1014 | { |
967 | btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", | 1015 | btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", |
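The new btrfs_lock_and_flush_ordered_range() helper added above folds in the lock/lookup-ordered/wait loop that the file.c and inode.c hunks previously open-coded. A rough usage sketch, loosely modeled on the converted check_can_nocow() call site, is shown below; the function name, the surrounding logic and the include list are illustrative approximations, only the btrfs calls themselves come from the patch.

#include "ctree.h"
#include "btrfs_inode.h"
#include "ordered-data.h"

/* Illustrative caller: lock a sector-aligned range with no ordered extents
 * left pending in it, do some work, then drop the extent lock. */
static void example_lock_flushed_range(struct btrfs_inode *inode, u64 pos,
				       u64 len)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_state *cached_state = NULL;
	u64 lockstart = round_down(pos, fs_info->sectorsize);
	u64 lockend = round_up(pos + len, fs_info->sectorsize) - 1;

	btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart,
					   lockend, &cached_state);

	/* ... inspect or modify the extent state for the locked range ... */

	unlock_extent_cached(&inode->io_tree, lockstart, lockend,
			     &cached_state);
}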
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4c5991c3de14..5204171ea962 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h | |||
@@ -23,7 +23,7 @@ struct btrfs_ordered_sum { | |||
23 | int len; | 23 | int len; |
24 | struct list_head list; | 24 | struct list_head list; |
25 | /* last field is a variable length array of csums */ | 25 | /* last field is a variable length array of csums */ |
26 | u32 sums[]; | 26 | u8 sums[]; |
27 | }; | 27 | }; |
28 | 28 | ||
29 | /* | 29 | /* |
@@ -183,11 +183,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( | |||
183 | int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, | 183 | int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, |
184 | struct btrfs_ordered_extent *ordered); | 184 | struct btrfs_ordered_extent *ordered); |
185 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, | 185 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, |
186 | u32 *sum, int len); | 186 | u8 *sum, int len); |
187 | u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, | 187 | u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, |
188 | const u64 range_start, const u64 range_len); | 188 | const u64 range_start, const u64 range_len); |
189 | u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, | 189 | u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, |
190 | const u64 range_start, const u64 range_len); | 190 | const u64 range_start, const u64 range_len); |
191 | void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, | ||
192 | struct btrfs_inode *inode, u64 start, | ||
193 | u64 end, | ||
194 | struct extent_state **cached_state); | ||
191 | int __init ordered_data_init(void); | 195 | int __init ordered_data_init(void); |
192 | void __cold ordered_data_exit(void); | 196 | void __cold ordered_data_exit(void); |
193 | 197 | ||
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 1141ca5fae6a..9cb50577d982 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c | |||
@@ -153,11 +153,11 @@ static void print_eb_refs_lock(struct extent_buffer *eb) | |||
153 | #ifdef CONFIG_BTRFS_DEBUG | 153 | #ifdef CONFIG_BTRFS_DEBUG |
154 | btrfs_info(eb->fs_info, | 154 | btrfs_info(eb->fs_info, |
155 | "refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u", | 155 | "refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u", |
156 | atomic_read(&eb->refs), atomic_read(&eb->write_locks), | 156 | atomic_read(&eb->refs), eb->write_locks, |
157 | atomic_read(&eb->read_locks), | 157 | atomic_read(&eb->read_locks), |
158 | atomic_read(&eb->blocking_writers), | 158 | eb->blocking_writers, |
159 | atomic_read(&eb->blocking_readers), | 159 | atomic_read(&eb->blocking_readers), |
160 | atomic_read(&eb->spinning_writers), | 160 | eb->spinning_writers, |
161 | atomic_read(&eb->spinning_readers), | 161 | atomic_read(&eb->spinning_readers), |
162 | eb->lock_owner, current->pid); | 162 | eb->lock_owner, current->pid); |
163 | #endif | 163 | #endif |
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index a9e2e66152ee..e0469816c678 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c | |||
@@ -257,11 +257,7 @@ static int prop_compression_validate(const char *value, size_t len) | |||
257 | if (!value) | 257 | if (!value) |
258 | return 0; | 258 | return 0; |
259 | 259 | ||
260 | if (!strncmp("lzo", value, 3)) | 260 | if (btrfs_compress_is_valid_type(value, len)) |
261 | return 0; | ||
262 | else if (!strncmp("zlib", value, 4)) | ||
263 | return 0; | ||
264 | else if (!strncmp("zstd", value, 4)) | ||
265 | return 0; | 261 | return 0; |
266 | 262 | ||
267 | return -EINVAL; | 263 | return -EINVAL; |
@@ -341,7 +337,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, | |||
341 | for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { | 337 | for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { |
342 | const struct prop_handler *h = &prop_handlers[i]; | 338 | const struct prop_handler *h = &prop_handlers[i]; |
343 | const char *value; | 339 | const char *value; |
344 | u64 num_bytes; | 340 | u64 num_bytes = 0; |
345 | 341 | ||
346 | if (!h->inheritable) | 342 | if (!h->inheritable) |
347 | continue; | 343 | continue; |
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 3e6ffbbd8b0a..f8a3c1b0a15a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c | |||
@@ -2614,6 +2614,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, | |||
2614 | int ret = 0; | 2614 | int ret = 0; |
2615 | int i; | 2615 | int i; |
2616 | u64 *i_qgroups; | 2616 | u64 *i_qgroups; |
2617 | bool committing = false; | ||
2617 | struct btrfs_fs_info *fs_info = trans->fs_info; | 2618 | struct btrfs_fs_info *fs_info = trans->fs_info; |
2618 | struct btrfs_root *quota_root; | 2619 | struct btrfs_root *quota_root; |
2619 | struct btrfs_qgroup *srcgroup; | 2620 | struct btrfs_qgroup *srcgroup; |
@@ -2621,7 +2622,25 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, | |||
2621 | u32 level_size = 0; | 2622 | u32 level_size = 0; |
2622 | u64 nums; | 2623 | u64 nums; |
2623 | 2624 | ||
2624 | mutex_lock(&fs_info->qgroup_ioctl_lock); | 2625 | /* |
2626 | * There are only two callers of this function. | ||
2627 | * | ||
2628 | * One in create_subvol() in the ioctl context, which needs to hold | ||
2629 | * the qgroup_ioctl_lock. | ||
2630 | * | ||
2631 | * The other one is in create_pending_snapshot(), where no other qgroup | ||
2632 | * code can modify the fs as they all need to either start a new trans | ||
2633 | * or hold a trans handle, thus we don't need to hold the | ||
2634 | * qgroup_ioctl_lock. | ||
2635 | * This avoids a long and complex lock chain and makes lockdep happy. | ||
2636 | */ | ||
2637 | spin_lock(&fs_info->trans_lock); | ||
2638 | if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) | ||
2639 | committing = true; | ||
2640 | spin_unlock(&fs_info->trans_lock); | ||
2641 | |||
2642 | if (!committing) | ||
2643 | mutex_lock(&fs_info->qgroup_ioctl_lock); | ||
2625 | if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) | 2644 | if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) |
2626 | goto out; | 2645 | goto out; |
2627 | 2646 | ||
@@ -2785,7 +2804,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, | |||
2785 | unlock: | 2804 | unlock: |
2786 | spin_unlock(&fs_info->qgroup_lock); | 2805 | spin_unlock(&fs_info->qgroup_lock); |
2787 | out: | 2806 | out: |
2788 | mutex_unlock(&fs_info->qgroup_ioctl_lock); | 2807 | if (!committing) |
2808 | mutex_unlock(&fs_info->qgroup_ioctl_lock); | ||
2789 | return ret; | 2809 | return ret; |
2790 | } | 2810 | } |
2791 | 2811 | ||
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index f5d4c13a8dbc..2503485db859 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h | |||
@@ -7,7 +7,7 @@ | |||
7 | #ifndef BTRFS_RAID56_H | 7 | #ifndef BTRFS_RAID56_H |
8 | #define BTRFS_RAID56_H | 8 | #define BTRFS_RAID56_H |
9 | 9 | ||
10 | static inline int nr_parity_stripes(struct map_lookup *map) | 10 | static inline int nr_parity_stripes(const struct map_lookup *map) |
11 | { | 11 | { |
12 | if (map->type & BTRFS_BLOCK_GROUP_RAID5) | 12 | if (map->type & BTRFS_BLOCK_GROUP_RAID5) |
13 | return 1; | 13 | return 1; |
@@ -17,7 +17,7 @@ static inline int nr_parity_stripes(struct map_lookup *map) | |||
17 | return 0; | 17 | return 0; |
18 | } | 18 | } |
19 | 19 | ||
20 | static inline int nr_data_stripes(struct map_lookup *map) | 20 | static inline int nr_data_stripes(const struct map_lookup *map) |
21 | { | 21 | { |
22 | return map->num_stripes - nr_parity_stripes(map); | 22 | return map->num_stripes - nr_parity_stripes(map); |
23 | } | 23 | } |
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 22a3c69864fa..7f219851fa23 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include "inode-map.h" | 20 | #include "inode-map.h" |
21 | #include "qgroup.h" | 21 | #include "qgroup.h" |
22 | #include "print-tree.h" | 22 | #include "print-tree.h" |
23 | #include "delalloc-space.h" | ||
23 | 24 | ||
24 | /* | 25 | /* |
25 | * backref_node, mapping_node and tree_block start with this | 26 | * backref_node, mapping_node and tree_block start with this |
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 22124122728c..47733fb55df7 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c | |||
@@ -9,6 +9,8 @@ | |||
9 | #include "transaction.h" | 9 | #include "transaction.h" |
10 | #include "disk-io.h" | 10 | #include "disk-io.h" |
11 | #include "print-tree.h" | 11 | #include "print-tree.h" |
12 | #include "qgroup.h" | ||
13 | #include "space-info.h" | ||
12 | 14 | ||
13 | /* | 15 | /* |
14 | * Read a root item from the tree. In case we detect a root item smaller than | 16 |
@@ -497,3 +499,57 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, | |||
497 | btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec); | 499 | btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec); |
498 | spin_unlock(&root->root_item_lock); | 500 | spin_unlock(&root->root_item_lock); |
499 | } | 501 | } |
502 | |||
503 | /* | ||
504 | * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation | ||
505 | * root: the root of the parent directory | ||
506 | * rsv: block reservation | ||
507 | * items: the number of items that we need to reserve space for | ||
508 | * use_global_rsv: allow fallback to the global block reservation | ||
509 | * | ||
510 | * This function is used to reserve the space for snapshot/subvolume | ||
511 | * creation and deletion. Those operations differ from common | ||
512 | * file/directory operations: they change two fs/file trees | ||
513 | * and the root tree, and the number of items that the qgroup reserves | ||
514 | * differs from the free space reservation. So we cannot use | ||
515 | * the space reservation mechanism in start_transaction(). | ||
516 | */ | ||
517 | int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, | ||
518 | struct btrfs_block_rsv *rsv, int items, | ||
519 | bool use_global_rsv) | ||
520 | { | ||
521 | u64 qgroup_num_bytes = 0; | ||
522 | u64 num_bytes; | ||
523 | int ret; | ||
524 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
525 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
526 | |||
527 | if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { | ||
528 | /* One for parent inode, two for dir entries */ | ||
529 | qgroup_num_bytes = 3 * fs_info->nodesize; | ||
530 | ret = btrfs_qgroup_reserve_meta_prealloc(root, | ||
531 | qgroup_num_bytes, true); | ||
532 | if (ret) | ||
533 | return ret; | ||
534 | } | ||
535 | |||
536 | num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); | ||
537 | rsv->space_info = btrfs_find_space_info(fs_info, | ||
538 | BTRFS_BLOCK_GROUP_METADATA); | ||
539 | ret = btrfs_block_rsv_add(root, rsv, num_bytes, | ||
540 | BTRFS_RESERVE_FLUSH_ALL); | ||
541 | |||
542 | if (ret == -ENOSPC && use_global_rsv) | ||
543 | ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); | ||
544 | |||
545 | if (ret && qgroup_num_bytes) | ||
546 | btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); | ||
547 | |||
548 | return ret; | ||
549 | } | ||
550 | |||
551 | void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, | ||
552 | struct btrfs_block_rsv *rsv) | ||
553 | { | ||
554 | btrfs_block_rsv_release(fs_info, rsv, (u64)-1); | ||
555 | } | ||
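btrfs_subvolume_reserve_metadata() and btrfs_subvolume_release_metadata() are added to root-tree.c above; callers such as create_subvol() pair them around the transaction that creates the new root. A rough sketch of that pairing follows; the helper name, the item count, the simplified error handling and the include list are illustrative assumptions rather than code from the patch.

#include "ctree.h"
#include "space-info.h"

/* Illustrative caller: reserve metadata space for a subvolume operation,
 * do the work, then release whatever is left of the reservation. */
static int example_subvol_reservation(struct btrfs_root *parent_root)
{
	struct btrfs_fs_info *fs_info = parent_root->fs_info;
	struct btrfs_block_rsv block_rsv;
	int ret;

	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
	/* Rough item count: inode item, dir items, root item, backrefs, ... */
	ret = btrfs_subvolume_reserve_metadata(parent_root, &block_rsv, 8, false);
	if (ret)
		return ret;

	/* ... start a transaction and create the subvolume or snapshot ... */

	btrfs_subvolume_release_metadata(fs_info, &block_rsv);
	return 0;
}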
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f7b29f9db5e2..0c99cf9fb595 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/blkdev.h> | 6 | #include <linux/blkdev.h> |
7 | #include <linux/ratelimit.h> | 7 | #include <linux/ratelimit.h> |
8 | #include <linux/sched/mm.h> | 8 | #include <linux/sched/mm.h> |
9 | #include <crypto/hash.h> | ||
9 | #include "ctree.h" | 10 | #include "ctree.h" |
10 | #include "volumes.h" | 11 | #include "volumes.h" |
11 | #include "disk-io.h" | 12 | #include "disk-io.h" |
@@ -1787,11 +1788,12 @@ static int scrub_checksum(struct scrub_block *sblock) | |||
1787 | static int scrub_checksum_data(struct scrub_block *sblock) | 1788 | static int scrub_checksum_data(struct scrub_block *sblock) |
1788 | { | 1789 | { |
1789 | struct scrub_ctx *sctx = sblock->sctx; | 1790 | struct scrub_ctx *sctx = sblock->sctx; |
1791 | struct btrfs_fs_info *fs_info = sctx->fs_info; | ||
1792 | SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); | ||
1790 | u8 csum[BTRFS_CSUM_SIZE]; | 1793 | u8 csum[BTRFS_CSUM_SIZE]; |
1791 | u8 *on_disk_csum; | 1794 | u8 *on_disk_csum; |
1792 | struct page *page; | 1795 | struct page *page; |
1793 | void *buffer; | 1796 | void *buffer; |
1794 | u32 crc = ~(u32)0; | ||
1795 | u64 len; | 1797 | u64 len; |
1796 | int index; | 1798 | int index; |
1797 | 1799 | ||
@@ -1799,6 +1801,9 @@ static int scrub_checksum_data(struct scrub_block *sblock) | |||
1799 | if (!sblock->pagev[0]->have_csum) | 1801 | if (!sblock->pagev[0]->have_csum) |
1800 | return 0; | 1802 | return 0; |
1801 | 1803 | ||
1804 | shash->tfm = fs_info->csum_shash; | ||
1805 | crypto_shash_init(shash); | ||
1806 | |||
1802 | on_disk_csum = sblock->pagev[0]->csum; | 1807 | on_disk_csum = sblock->pagev[0]->csum; |
1803 | page = sblock->pagev[0]->page; | 1808 | page = sblock->pagev[0]->page; |
1804 | buffer = kmap_atomic(page); | 1809 | buffer = kmap_atomic(page); |
@@ -1808,7 +1813,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) | |||
1808 | for (;;) { | 1813 | for (;;) { |
1809 | u64 l = min_t(u64, len, PAGE_SIZE); | 1814 | u64 l = min_t(u64, len, PAGE_SIZE); |
1810 | 1815 | ||
1811 | crc = btrfs_csum_data(buffer, crc, l); | 1816 | crypto_shash_update(shash, buffer, l); |
1812 | kunmap_atomic(buffer); | 1817 | kunmap_atomic(buffer); |
1813 | len -= l; | 1818 | len -= l; |
1814 | if (len == 0) | 1819 | if (len == 0) |
@@ -1820,7 +1825,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) | |||
1820 | buffer = kmap_atomic(page); | 1825 | buffer = kmap_atomic(page); |
1821 | } | 1826 | } |
1822 | 1827 | ||
1823 | btrfs_csum_final(crc, csum); | 1828 | crypto_shash_final(shash, csum); |
1824 | if (memcmp(csum, on_disk_csum, sctx->csum_size)) | 1829 | if (memcmp(csum, on_disk_csum, sctx->csum_size)) |
1825 | sblock->checksum_error = 1; | 1830 | sblock->checksum_error = 1; |
1826 | 1831 | ||
@@ -1832,16 +1837,19 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) | |||
1832 | struct scrub_ctx *sctx = sblock->sctx; | 1837 | struct scrub_ctx *sctx = sblock->sctx; |
1833 | struct btrfs_header *h; | 1838 | struct btrfs_header *h; |
1834 | struct btrfs_fs_info *fs_info = sctx->fs_info; | 1839 | struct btrfs_fs_info *fs_info = sctx->fs_info; |
1840 | SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); | ||
1835 | u8 calculated_csum[BTRFS_CSUM_SIZE]; | 1841 | u8 calculated_csum[BTRFS_CSUM_SIZE]; |
1836 | u8 on_disk_csum[BTRFS_CSUM_SIZE]; | 1842 | u8 on_disk_csum[BTRFS_CSUM_SIZE]; |
1837 | struct page *page; | 1843 | struct page *page; |
1838 | void *mapped_buffer; | 1844 | void *mapped_buffer; |
1839 | u64 mapped_size; | 1845 | u64 mapped_size; |
1840 | void *p; | 1846 | void *p; |
1841 | u32 crc = ~(u32)0; | ||
1842 | u64 len; | 1847 | u64 len; |
1843 | int index; | 1848 | int index; |
1844 | 1849 | ||
1850 | shash->tfm = fs_info->csum_shash; | ||
1851 | crypto_shash_init(shash); | ||
1852 | |||
1845 | BUG_ON(sblock->page_count < 1); | 1853 | BUG_ON(sblock->page_count < 1); |
1846 | page = sblock->pagev[0]->page; | 1854 | page = sblock->pagev[0]->page; |
1847 | mapped_buffer = kmap_atomic(page); | 1855 | mapped_buffer = kmap_atomic(page); |
@@ -1875,7 +1883,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) | |||
1875 | for (;;) { | 1883 | for (;;) { |
1876 | u64 l = min_t(u64, len, mapped_size); | 1884 | u64 l = min_t(u64, len, mapped_size); |
1877 | 1885 | ||
1878 | crc = btrfs_csum_data(p, crc, l); | 1886 | crypto_shash_update(shash, p, l); |
1879 | kunmap_atomic(mapped_buffer); | 1887 | kunmap_atomic(mapped_buffer); |
1880 | len -= l; | 1888 | len -= l; |
1881 | if (len == 0) | 1889 | if (len == 0) |
@@ -1889,7 +1897,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) | |||
1889 | p = mapped_buffer; | 1897 | p = mapped_buffer; |
1890 | } | 1898 | } |
1891 | 1899 | ||
1892 | btrfs_csum_final(crc, calculated_csum); | 1900 | crypto_shash_final(shash, calculated_csum); |
1893 | if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) | 1901 | if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) |
1894 | sblock->checksum_error = 1; | 1902 | sblock->checksum_error = 1; |
1895 | 1903 | ||
@@ -1900,18 +1908,22 @@ static int scrub_checksum_super(struct scrub_block *sblock) | |||
1900 | { | 1908 | { |
1901 | struct btrfs_super_block *s; | 1909 | struct btrfs_super_block *s; |
1902 | struct scrub_ctx *sctx = sblock->sctx; | 1910 | struct scrub_ctx *sctx = sblock->sctx; |
1911 | struct btrfs_fs_info *fs_info = sctx->fs_info; | ||
1912 | SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); | ||
1903 | u8 calculated_csum[BTRFS_CSUM_SIZE]; | 1913 | u8 calculated_csum[BTRFS_CSUM_SIZE]; |
1904 | u8 on_disk_csum[BTRFS_CSUM_SIZE]; | 1914 | u8 on_disk_csum[BTRFS_CSUM_SIZE]; |
1905 | struct page *page; | 1915 | struct page *page; |
1906 | void *mapped_buffer; | 1916 | void *mapped_buffer; |
1907 | u64 mapped_size; | 1917 | u64 mapped_size; |
1908 | void *p; | 1918 | void *p; |
1909 | u32 crc = ~(u32)0; | ||
1910 | int fail_gen = 0; | 1919 | int fail_gen = 0; |
1911 | int fail_cor = 0; | 1920 | int fail_cor = 0; |
1912 | u64 len; | 1921 | u64 len; |
1913 | int index; | 1922 | int index; |
1914 | 1923 | ||
1924 | shash->tfm = fs_info->csum_shash; | ||
1925 | crypto_shash_init(shash); | ||
1926 | |||
1915 | BUG_ON(sblock->page_count < 1); | 1927 | BUG_ON(sblock->page_count < 1); |
1916 | page = sblock->pagev[0]->page; | 1928 | page = sblock->pagev[0]->page; |
1917 | mapped_buffer = kmap_atomic(page); | 1929 | mapped_buffer = kmap_atomic(page); |
@@ -1934,7 +1946,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) | |||
1934 | for (;;) { | 1946 | for (;;) { |
1935 | u64 l = min_t(u64, len, mapped_size); | 1947 | u64 l = min_t(u64, len, mapped_size); |
1936 | 1948 | ||
1937 | crc = btrfs_csum_data(p, crc, l); | 1949 | crypto_shash_update(shash, p, l); |
1938 | kunmap_atomic(mapped_buffer); | 1950 | kunmap_atomic(mapped_buffer); |
1939 | len -= l; | 1951 | len -= l; |
1940 | if (len == 0) | 1952 | if (len == 0) |
@@ -1948,7 +1960,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) | |||
1948 | p = mapped_buffer; | 1960 | p = mapped_buffer; |
1949 | } | 1961 | } |
1950 | 1962 | ||
1951 | btrfs_csum_final(crc, calculated_csum); | 1963 | crypto_shash_final(shash, calculated_csum); |
1952 | if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) | 1964 | if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) |
1953 | ++fail_cor; | 1965 | ++fail_cor; |
1954 | 1966 | ||
@@ -2448,7 +2460,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) | |||
2448 | ASSERT(index < UINT_MAX); | 2460 | ASSERT(index < UINT_MAX); |
2449 | 2461 | ||
2450 | num_sectors = sum->len / sctx->fs_info->sectorsize; | 2462 | num_sectors = sum->len / sctx->fs_info->sectorsize; |
2451 | memcpy(csum, sum->sums + index, sctx->csum_size); | 2463 | memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size); |
2452 | if (index == num_sectors - 1) { | 2464 | if (index == num_sectors - 1) { |
2453 | list_del(&sum->list); | 2465 | list_del(&sum->list); |
2454 | kfree(sum); | 2466 | kfree(sum); |
@@ -2660,18 +2672,18 @@ static int get_raid56_logic_offset(u64 physical, int num, | |||
2660 | u64 last_offset; | 2672 | u64 last_offset; |
2661 | u32 stripe_index; | 2673 | u32 stripe_index; |
2662 | u32 rot; | 2674 | u32 rot; |
2675 | const int data_stripes = nr_data_stripes(map); | ||
2663 | 2676 | ||
2664 | last_offset = (physical - map->stripes[num].physical) * | 2677 | last_offset = (physical - map->stripes[num].physical) * data_stripes; |
2665 | nr_data_stripes(map); | ||
2666 | if (stripe_start) | 2678 | if (stripe_start) |
2667 | *stripe_start = last_offset; | 2679 | *stripe_start = last_offset; |
2668 | 2680 | ||
2669 | *offset = last_offset; | 2681 | *offset = last_offset; |
2670 | for (i = 0; i < nr_data_stripes(map); i++) { | 2682 | for (i = 0; i < data_stripes; i++) { |
2671 | *offset = last_offset + i * map->stripe_len; | 2683 | *offset = last_offset + i * map->stripe_len; |
2672 | 2684 | ||
2673 | stripe_nr = div64_u64(*offset, map->stripe_len); | 2685 | stripe_nr = div64_u64(*offset, map->stripe_len); |
2674 | stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); | 2686 | stripe_nr = div_u64(stripe_nr, data_stripes); |
2675 | 2687 | ||
2676 | /* Work out the disk rotation on this stripe-set */ | 2688 | /* Work out the disk rotation on this stripe-set */ |
2677 | stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); | 2689 | stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); |
@@ -3079,7 +3091,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, | |||
3079 | offset = map->stripe_len * (num / map->sub_stripes); | 3091 | offset = map->stripe_len * (num / map->sub_stripes); |
3080 | increment = map->stripe_len * factor; | 3092 | increment = map->stripe_len * factor; |
3081 | mirror_num = num % map->sub_stripes + 1; | 3093 | mirror_num = num % map->sub_stripes + 1; |
3082 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { | 3094 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { |
3083 | increment = map->stripe_len; | 3095 | increment = map->stripe_len; |
3084 | mirror_num = num % map->num_stripes + 1; | 3096 | mirror_num = num % map->num_stripes + 1; |
3085 | } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { | 3097 | } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { |
@@ -3410,15 +3422,15 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, | |||
3410 | struct btrfs_block_group_cache *cache) | 3422 | struct btrfs_block_group_cache *cache) |
3411 | { | 3423 | { |
3412 | struct btrfs_fs_info *fs_info = sctx->fs_info; | 3424 | struct btrfs_fs_info *fs_info = sctx->fs_info; |
3413 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; | 3425 | struct extent_map_tree *map_tree = &fs_info->mapping_tree; |
3414 | struct map_lookup *map; | 3426 | struct map_lookup *map; |
3415 | struct extent_map *em; | 3427 | struct extent_map *em; |
3416 | int i; | 3428 | int i; |
3417 | int ret = 0; | 3429 | int ret = 0; |
3418 | 3430 | ||
3419 | read_lock(&map_tree->map_tree.lock); | 3431 | read_lock(&map_tree->lock); |
3420 | em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); | 3432 | em = lookup_extent_mapping(map_tree, chunk_offset, 1); |
3421 | read_unlock(&map_tree->map_tree.lock); | 3433 | read_unlock(&map_tree->lock); |
3422 | 3434 | ||
3423 | if (!em) { | 3435 | if (!em) { |
3424 | /* | 3436 | /* |
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f7fe4770f0e5..69b59bf75882 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c | |||
@@ -686,7 +686,7 @@ static int send_cmd(struct send_ctx *sctx) | |||
686 | hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); | 686 | hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); |
687 | hdr->crc = 0; | 687 | hdr->crc = 0; |
688 | 688 | ||
689 | crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); | 689 | crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); |
690 | hdr->crc = cpu_to_le32(crc); | 690 | hdr->crc = cpu_to_le32(crc); |
691 | 691 | ||
692 | ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, | 692 | ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, |
@@ -6929,9 +6929,23 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) | |||
6929 | if (ret) | 6929 | if (ret) |
6930 | goto out; | 6930 | goto out; |
6931 | 6931 | ||
6932 | mutex_lock(&fs_info->balance_mutex); | ||
6933 | if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { | ||
6934 | mutex_unlock(&fs_info->balance_mutex); | ||
6935 | btrfs_warn_rl(fs_info, | ||
6936 | "cannot run send because a balance operation is in progress"); | ||
6937 | ret = -EAGAIN; | ||
6938 | goto out; | ||
6939 | } | ||
6940 | fs_info->send_in_progress++; | ||
6941 | mutex_unlock(&fs_info->balance_mutex); | ||
6942 | |||
6932 | current->journal_info = BTRFS_SEND_TRANS_STUB; | 6943 | current->journal_info = BTRFS_SEND_TRANS_STUB; |
6933 | ret = send_subvol(sctx); | 6944 | ret = send_subvol(sctx); |
6934 | current->journal_info = NULL; | 6945 | current->journal_info = NULL; |
6946 | mutex_lock(&fs_info->balance_mutex); | ||
6947 | fs_info->send_in_progress--; | ||
6948 | mutex_unlock(&fs_info->balance_mutex); | ||
6935 | if (ret < 0) | 6949 | if (ret < 0) |
6936 | goto out; | 6950 | goto out; |
6937 | 6951 | ||
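The added hunk makes send and balance mutually exclusive: under balance_mutex, send bails out with -EAGAIN if a balance is running, otherwise bumps send_in_progress for the duration of the operation. A user-space sketch of the same pattern (pthread-based; the names are illustrative, not the kernel API):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t balance_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool balance_running;
static int send_in_progress;

static int start_send(void)
{
	pthread_mutex_lock(&balance_mutex);
	if (balance_running) {
		pthread_mutex_unlock(&balance_mutex);
		fprintf(stderr, "cannot run send: balance in progress\n");
		return -EAGAIN;
	}
	send_in_progress++;
	pthread_mutex_unlock(&balance_mutex);
	return 0;
}

static void finish_send(void)
{
	pthread_mutex_lock(&balance_mutex);
	send_in_progress--;
	pthread_mutex_unlock(&balance_mutex);
}

int main(void)
{
	if (start_send() == 0) {
		/* ... do the send work here ... */
		finish_send();
	}
	return 0;
}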
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c new file mode 100644 index 000000000000..ab7b9ec4c240 --- /dev/null +++ b/fs/btrfs/space-info.c | |||
@@ -0,0 +1,1094 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | #include "ctree.h" | ||
4 | #include "space-info.h" | ||
5 | #include "sysfs.h" | ||
6 | #include "volumes.h" | ||
7 | #include "free-space-cache.h" | ||
8 | #include "ordered-data.h" | ||
9 | #include "transaction.h" | ||
10 | #include "math.h" | ||
11 | |||
12 | u64 btrfs_space_info_used(struct btrfs_space_info *s_info, | ||
13 | bool may_use_included) | ||
14 | { | ||
15 | ASSERT(s_info); | ||
16 | return s_info->bytes_used + s_info->bytes_reserved + | ||
17 | s_info->bytes_pinned + s_info->bytes_readonly + | ||
18 | (may_use_included ? s_info->bytes_may_use : 0); | ||
19 | } | ||
20 | |||
21 | /* | ||
22 | * after adding space to the filesystem, we need to clear the full flags | ||
23 | * on all the space infos. | ||
24 | */ | ||
25 | void btrfs_clear_space_info_full(struct btrfs_fs_info *info) | ||
26 | { | ||
27 | struct list_head *head = &info->space_info; | ||
28 | struct btrfs_space_info *found; | ||
29 | |||
30 | rcu_read_lock(); | ||
31 | list_for_each_entry_rcu(found, head, list) | ||
32 | found->full = 0; | ||
33 | rcu_read_unlock(); | ||
34 | } | ||
35 | |||
36 | static const char *alloc_name(u64 flags) | ||
37 | { | ||
38 | switch (flags) { | ||
39 | case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: | ||
40 | return "mixed"; | ||
41 | case BTRFS_BLOCK_GROUP_METADATA: | ||
42 | return "metadata"; | ||
43 | case BTRFS_BLOCK_GROUP_DATA: | ||
44 | return "data"; | ||
45 | case BTRFS_BLOCK_GROUP_SYSTEM: | ||
46 | return "system"; | ||
47 | default: | ||
48 | WARN_ON(1); | ||
49 | return "invalid-combination"; | ||
50 | }; | ||
51 | } | ||
52 | |||
53 | static int create_space_info(struct btrfs_fs_info *info, u64 flags) | ||
54 | { | ||
55 | |||
56 | struct btrfs_space_info *space_info; | ||
57 | int i; | ||
58 | int ret; | ||
59 | |||
60 | space_info = kzalloc(sizeof(*space_info), GFP_NOFS); | ||
61 | if (!space_info) | ||
62 | return -ENOMEM; | ||
63 | |||
64 | ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, | ||
65 | GFP_KERNEL); | ||
66 | if (ret) { | ||
67 | kfree(space_info); | ||
68 | return ret; | ||
69 | } | ||
70 | |||
71 | for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) | ||
72 | INIT_LIST_HEAD(&space_info->block_groups[i]); | ||
73 | init_rwsem(&space_info->groups_sem); | ||
74 | spin_lock_init(&space_info->lock); | ||
75 | space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; | ||
76 | space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; | ||
77 | init_waitqueue_head(&space_info->wait); | ||
78 | INIT_LIST_HEAD(&space_info->ro_bgs); | ||
79 | INIT_LIST_HEAD(&space_info->tickets); | ||
80 | INIT_LIST_HEAD(&space_info->priority_tickets); | ||
81 | |||
82 | ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, | ||
83 | info->space_info_kobj, "%s", | ||
84 | alloc_name(space_info->flags)); | ||
85 | if (ret) { | ||
86 | kobject_put(&space_info->kobj); | ||
87 | return ret; | ||
88 | } | ||
89 | |||
90 | list_add_rcu(&space_info->list, &info->space_info); | ||
91 | if (flags & BTRFS_BLOCK_GROUP_DATA) | ||
92 | info->data_sinfo = space_info; | ||
93 | |||
94 | return ret; | ||
95 | } | ||
96 | |||
97 | int btrfs_init_space_info(struct btrfs_fs_info *fs_info) | ||
98 | { | ||
99 | struct btrfs_super_block *disk_super; | ||
100 | u64 features; | ||
101 | u64 flags; | ||
102 | int mixed = 0; | ||
103 | int ret; | ||
104 | |||
105 | disk_super = fs_info->super_copy; | ||
106 | if (!btrfs_super_root(disk_super)) | ||
107 | return -EINVAL; | ||
108 | |||
109 | features = btrfs_super_incompat_flags(disk_super); | ||
110 | if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) | ||
111 | mixed = 1; | ||
112 | |||
113 | flags = BTRFS_BLOCK_GROUP_SYSTEM; | ||
114 | ret = create_space_info(fs_info, flags); | ||
115 | if (ret) | ||
116 | goto out; | ||
117 | |||
118 | if (mixed) { | ||
119 | flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; | ||
120 | ret = create_space_info(fs_info, flags); | ||
121 | } else { | ||
122 | flags = BTRFS_BLOCK_GROUP_METADATA; | ||
123 | ret = create_space_info(fs_info, flags); | ||
124 | if (ret) | ||
125 | goto out; | ||
126 | |||
127 | flags = BTRFS_BLOCK_GROUP_DATA; | ||
128 | ret = create_space_info(fs_info, flags); | ||
129 | } | ||
130 | out: | ||
131 | return ret; | ||
132 | } | ||
133 | |||
134 | void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, | ||
135 | u64 total_bytes, u64 bytes_used, | ||
136 | u64 bytes_readonly, | ||
137 | struct btrfs_space_info **space_info) | ||
138 | { | ||
139 | struct btrfs_space_info *found; | ||
140 | int factor; | ||
141 | |||
142 | factor = btrfs_bg_type_to_factor(flags); | ||
143 | |||
144 | found = btrfs_find_space_info(info, flags); | ||
145 | ASSERT(found); | ||
146 | spin_lock(&found->lock); | ||
147 | found->total_bytes += total_bytes; | ||
148 | found->disk_total += total_bytes * factor; | ||
149 | found->bytes_used += bytes_used; | ||
150 | found->disk_used += bytes_used * factor; | ||
151 | found->bytes_readonly += bytes_readonly; | ||
152 | if (total_bytes > 0) | ||
153 | found->full = 0; | ||
154 | btrfs_space_info_add_new_bytes(info, found, | ||
155 | total_bytes - bytes_used - | ||
156 | bytes_readonly); | ||
157 | spin_unlock(&found->lock); | ||
158 | *space_info = found; | ||
159 | } | ||
160 | |||
161 | struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, | ||
162 | u64 flags) | ||
163 | { | ||
164 | struct list_head *head = &info->space_info; | ||
165 | struct btrfs_space_info *found; | ||
166 | |||
167 | flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; | ||
168 | |||
169 | rcu_read_lock(); | ||
170 | list_for_each_entry_rcu(found, head, list) { | ||
171 | if (found->flags & flags) { | ||
172 | rcu_read_unlock(); | ||
173 | return found; | ||
174 | } | ||
175 | } | ||
176 | rcu_read_unlock(); | ||
177 | return NULL; | ||
178 | } | ||
179 | |||
180 | static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) | ||
181 | { | ||
182 | return (global->size << 1); | ||
183 | } | ||
184 | |||
185 | static int can_overcommit(struct btrfs_fs_info *fs_info, | ||
186 | struct btrfs_space_info *space_info, u64 bytes, | ||
187 | enum btrfs_reserve_flush_enum flush, | ||
188 | bool system_chunk) | ||
189 | { | ||
190 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
191 | u64 profile; | ||
192 | u64 space_size; | ||
193 | u64 avail; | ||
194 | u64 used; | ||
195 | int factor; | ||
196 | |||
197 | /* Don't overcommit when in mixed mode. */ | ||
198 | if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) | ||
199 | return 0; | ||
200 | |||
201 | if (system_chunk) | ||
202 | profile = btrfs_system_alloc_profile(fs_info); | ||
203 | else | ||
204 | profile = btrfs_metadata_alloc_profile(fs_info); | ||
205 | |||
206 | used = btrfs_space_info_used(space_info, false); | ||
207 | |||
208 | /* | ||
209 | 	 * We only want to allow overcommitting if we have lots of actual space | ||
210 | 	 * free, but if we don't have enough space to handle the global reserve | ||
211 | 	 * space then we could end up having a real ENOSPC problem when trying | ||
212 | * to allocate a chunk or some other such important allocation. | ||
213 | */ | ||
214 | spin_lock(&global_rsv->lock); | ||
215 | space_size = calc_global_rsv_need_space(global_rsv); | ||
216 | spin_unlock(&global_rsv->lock); | ||
217 | if (used + space_size >= space_info->total_bytes) | ||
218 | return 0; | ||
219 | |||
220 | used += space_info->bytes_may_use; | ||
221 | |||
222 | avail = atomic64_read(&fs_info->free_chunk_space); | ||
223 | |||
224 | /* | ||
225 | * If we have dup, raid1 or raid10 then only half of the free | ||
226 | * space is actually usable. For raid56, the space info used | ||
227 | * doesn't include the parity drive, so we don't have to | ||
228 | * change the math | ||
229 | */ | ||
230 | factor = btrfs_bg_type_to_factor(profile); | ||
231 | avail = div_u64(avail, factor); | ||
232 | |||
233 | /* | ||
234 | 	 * If we aren't flushing all things, let us overcommit up to half of | ||
235 | 	 * the space. If we can flush, don't let us overcommit too much, only | ||
236 | 	 * let it overcommit up to 1/8 of the space. | ||
237 | */ | ||
238 | if (flush == BTRFS_RESERVE_FLUSH_ALL) | ||
239 | avail >>= 3; | ||
240 | else | ||
241 | avail >>= 1; | ||
242 | |||
243 | if (used + bytes < space_info->total_bytes + avail) | ||
244 | return 1; | ||
245 | return 0; | ||
246 | } | ||
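A compact model of the overcommit decision above, with the thresholds spelled out (a sketch with simplified fields and a single "used" figure; the kernel function distinguishes used-with and used-without bytes_may_use):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool can_overcommit_model(uint64_t used, uint64_t total,
				 uint64_t free_chunk_space, uint64_t global_rsv,
				 int factor, bool flush_all, uint64_t bytes)
{
	uint64_t avail;

	/* Leave room for twice the global reserve before overcommitting */
	if (used + 2 * global_rsv >= total)
		return false;

	/* Halve free device space for dup/raid1/raid10-like profiles */
	avail = free_chunk_space / factor;
	/* Overcommit by 1/8 of free space when flushing, 1/2 otherwise */
	avail >>= flush_all ? 3 : 1;

	return used + bytes < total + avail;
}

int main(void)
{
	bool ok = can_overcommit_model(900ULL << 20, 1024ULL << 20,
				       512ULL << 20, 32ULL << 20, 2, true,
				       16ULL << 20);
	printf("overcommit allowed: %s\n", ok ? "yes" : "no");
	return 0;
}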
247 | |||
248 | /* | ||
249 | * This is for space we already have accounted in space_info->bytes_may_use, so | ||
250 |  * basically when we're returning space from block_rsvs. | ||
251 | */ | ||
252 | void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, | ||
253 | struct btrfs_space_info *space_info, | ||
254 | u64 num_bytes) | ||
255 | { | ||
256 | struct reserve_ticket *ticket; | ||
257 | struct list_head *head; | ||
258 | u64 used; | ||
259 | enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; | ||
260 | bool check_overcommit = false; | ||
261 | |||
262 | spin_lock(&space_info->lock); | ||
263 | head = &space_info->priority_tickets; | ||
264 | |||
265 | /* | ||
266 | * If we are over our limit then we need to check and see if we can | ||
267 | * overcommit, and if we can't then we just need to free up our space | ||
268 | * and not satisfy any requests. | ||
269 | */ | ||
270 | used = btrfs_space_info_used(space_info, true); | ||
271 | if (used - num_bytes >= space_info->total_bytes) | ||
272 | check_overcommit = true; | ||
273 | again: | ||
274 | while (!list_empty(head) && num_bytes) { | ||
275 | ticket = list_first_entry(head, struct reserve_ticket, | ||
276 | list); | ||
277 | /* | ||
278 | * We use 0 bytes because this space is already reserved, so | ||
279 | * adding the ticket space would be a double count. | ||
280 | */ | ||
281 | if (check_overcommit && | ||
282 | !can_overcommit(fs_info, space_info, 0, flush, false)) | ||
283 | break; | ||
284 | if (num_bytes >= ticket->bytes) { | ||
285 | list_del_init(&ticket->list); | ||
286 | num_bytes -= ticket->bytes; | ||
287 | ticket->bytes = 0; | ||
288 | space_info->tickets_id++; | ||
289 | wake_up(&ticket->wait); | ||
290 | } else { | ||
291 | ticket->bytes -= num_bytes; | ||
292 | num_bytes = 0; | ||
293 | } | ||
294 | } | ||
295 | |||
296 | if (num_bytes && head == &space_info->priority_tickets) { | ||
297 | head = &space_info->tickets; | ||
298 | flush = BTRFS_RESERVE_FLUSH_ALL; | ||
299 | goto again; | ||
300 | } | ||
301 | btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); | ||
302 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
303 | space_info->flags, num_bytes, 0); | ||
304 | spin_unlock(&space_info->lock); | ||
305 | } | ||
306 | |||
307 | /* | ||
308 | * This is for newly allocated space that isn't accounted in | ||
309 | * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent | ||
310 | * we use this helper. | ||
311 | */ | ||
312 | void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, | ||
313 | struct btrfs_space_info *space_info, | ||
314 | u64 num_bytes) | ||
315 | { | ||
316 | struct reserve_ticket *ticket; | ||
317 | struct list_head *head = &space_info->priority_tickets; | ||
318 | |||
319 | again: | ||
320 | while (!list_empty(head) && num_bytes) { | ||
321 | ticket = list_first_entry(head, struct reserve_ticket, | ||
322 | list); | ||
323 | if (num_bytes >= ticket->bytes) { | ||
324 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
325 | space_info->flags, | ||
326 | ticket->bytes, 1); | ||
327 | list_del_init(&ticket->list); | ||
328 | num_bytes -= ticket->bytes; | ||
329 | btrfs_space_info_update_bytes_may_use(fs_info, | ||
330 | space_info, | ||
331 | ticket->bytes); | ||
332 | ticket->bytes = 0; | ||
333 | space_info->tickets_id++; | ||
334 | wake_up(&ticket->wait); | ||
335 | } else { | ||
336 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
337 | space_info->flags, | ||
338 | num_bytes, 1); | ||
339 | btrfs_space_info_update_bytes_may_use(fs_info, | ||
340 | space_info, | ||
341 | num_bytes); | ||
342 | ticket->bytes -= num_bytes; | ||
343 | num_bytes = 0; | ||
344 | } | ||
345 | } | ||
346 | |||
347 | if (num_bytes && head == &space_info->priority_tickets) { | ||
348 | head = &space_info->tickets; | ||
349 | goto again; | ||
350 | } | ||
351 | } | ||
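Both helpers above walk the ticket lists in FIFO order and hand freed bytes to the waiting reservations, waking a ticket once it is fully satisfied. A minimal sketch of that granting loop (single flat list, illustrative types — not the kernel structures):

#include <stdint.h>
#include <stdio.h>

struct ticket {
	uint64_t bytes;		/* bytes still missing for this reservation */
	int granted;
};

/* Distribute num_bytes of freed space across the tickets, oldest first */
static uint64_t grant_to_tickets(struct ticket *t, int nr, uint64_t num_bytes)
{
	for (int i = 0; i < nr && num_bytes; i++) {
		if (t[i].granted)
			continue;
		if (num_bytes >= t[i].bytes) {
			num_bytes -= t[i].bytes;
			t[i].bytes = 0;
			t[i].granted = 1;	/* would wake_up() the waiter */
		} else {
			t[i].bytes -= num_bytes;
			num_bytes = 0;
		}
	}
	return num_bytes;	/* leftover space nobody was waiting for */
}

int main(void)
{
	struct ticket tickets[] = { { 4096, 0 }, { 16384, 0 }, { 8192, 0 } };
	uint64_t left = grant_to_tickets(tickets, 3, 12288);

	for (int i = 0; i < 3; i++)
		printf("ticket %d: missing %llu granted %d\n", i,
		       (unsigned long long)tickets[i].bytes, tickets[i].granted);
	printf("leftover %llu\n", (unsigned long long)left);
	return 0;
}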
352 | |||
353 | #define DUMP_BLOCK_RSV(fs_info, rsv_name) \ | ||
354 | do { \ | ||
355 | struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ | ||
356 | spin_lock(&__rsv->lock); \ | ||
357 | btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ | ||
358 | __rsv->size, __rsv->reserved); \ | ||
359 | spin_unlock(&__rsv->lock); \ | ||
360 | } while (0) | ||
361 | |||
362 | void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, | ||
363 | struct btrfs_space_info *info, u64 bytes, | ||
364 | int dump_block_groups) | ||
365 | { | ||
366 | struct btrfs_block_group_cache *cache; | ||
367 | int index = 0; | ||
368 | |||
369 | spin_lock(&info->lock); | ||
370 | btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", | ||
371 | info->flags, | ||
372 | info->total_bytes - btrfs_space_info_used(info, true), | ||
373 | info->full ? "" : "not "); | ||
374 | btrfs_info(fs_info, | ||
375 | "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", | ||
376 | info->total_bytes, info->bytes_used, info->bytes_pinned, | ||
377 | info->bytes_reserved, info->bytes_may_use, | ||
378 | info->bytes_readonly); | ||
379 | spin_unlock(&info->lock); | ||
380 | |||
381 | DUMP_BLOCK_RSV(fs_info, global_block_rsv); | ||
382 | DUMP_BLOCK_RSV(fs_info, trans_block_rsv); | ||
383 | DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); | ||
384 | DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); | ||
385 | DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); | ||
386 | |||
387 | if (!dump_block_groups) | ||
388 | return; | ||
389 | |||
390 | down_read(&info->groups_sem); | ||
391 | again: | ||
392 | list_for_each_entry(cache, &info->block_groups[index], list) { | ||
393 | spin_lock(&cache->lock); | ||
394 | btrfs_info(fs_info, | ||
395 | "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", | ||
396 | cache->key.objectid, cache->key.offset, | ||
397 | btrfs_block_group_used(&cache->item), cache->pinned, | ||
398 | cache->reserved, cache->ro ? "[readonly]" : ""); | ||
399 | btrfs_dump_free_space(cache, bytes); | ||
400 | spin_unlock(&cache->lock); | ||
401 | } | ||
402 | if (++index < BTRFS_NR_RAID_TYPES) | ||
403 | goto again; | ||
404 | up_read(&info->groups_sem); | ||
405 | } | ||
406 | |||
407 | static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, | ||
408 | unsigned long nr_pages, int nr_items) | ||
409 | { | ||
410 | struct super_block *sb = fs_info->sb; | ||
411 | |||
412 | if (down_read_trylock(&sb->s_umount)) { | ||
413 | writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); | ||
414 | up_read(&sb->s_umount); | ||
415 | } else { | ||
416 | /* | ||
417 | 		 * We need not worry about the filesystem going from r/w to r/o | ||
418 | 		 * even though we don't acquire ->s_umount, because the filesystem | ||
419 | 		 * should guarantee that the delalloc inode list is empty once the | ||
420 | 		 * filesystem is read-only (all dirty pages have been written to | ||
421 | 		 * the disk). | ||
422 | */ | ||
423 | btrfs_start_delalloc_roots(fs_info, nr_items); | ||
424 | if (!current->journal_info) | ||
425 | btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); | ||
426 | } | ||
427 | } | ||
428 | |||
429 | static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, | ||
430 | u64 to_reclaim) | ||
431 | { | ||
432 | u64 bytes; | ||
433 | u64 nr; | ||
434 | |||
435 | bytes = btrfs_calc_trans_metadata_size(fs_info, 1); | ||
436 | nr = div64_u64(to_reclaim, bytes); | ||
437 | if (!nr) | ||
438 | nr = 1; | ||
439 | return nr; | ||
440 | } | ||
441 | |||
442 | #define EXTENT_SIZE_PER_ITEM SZ_256K | ||
443 | |||
444 | /* | ||
445 | * shrink metadata reservation for delalloc | ||
446 | */ | ||
447 | static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, | ||
448 | u64 orig, bool wait_ordered) | ||
449 | { | ||
450 | struct btrfs_space_info *space_info; | ||
451 | struct btrfs_trans_handle *trans; | ||
452 | u64 delalloc_bytes; | ||
453 | u64 dio_bytes; | ||
454 | u64 async_pages; | ||
455 | u64 items; | ||
456 | long time_left; | ||
457 | unsigned long nr_pages; | ||
458 | int loops; | ||
459 | |||
460 | 	/* Calculate the number of pages we need to flush for the space reservation */ | ||
461 | items = calc_reclaim_items_nr(fs_info, to_reclaim); | ||
462 | to_reclaim = items * EXTENT_SIZE_PER_ITEM; | ||
463 | |||
464 | trans = (struct btrfs_trans_handle *)current->journal_info; | ||
465 | space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); | ||
466 | |||
467 | delalloc_bytes = percpu_counter_sum_positive( | ||
468 | &fs_info->delalloc_bytes); | ||
469 | dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); | ||
470 | if (delalloc_bytes == 0 && dio_bytes == 0) { | ||
471 | if (trans) | ||
472 | return; | ||
473 | if (wait_ordered) | ||
474 | btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); | ||
475 | return; | ||
476 | } | ||
477 | |||
478 | /* | ||
479 | * If we are doing more ordered than delalloc we need to just wait on | ||
480 | * ordered extents, otherwise we'll waste time trying to flush delalloc | ||
481 | * that likely won't give us the space back we need. | ||
482 | */ | ||
483 | if (dio_bytes > delalloc_bytes) | ||
484 | wait_ordered = true; | ||
485 | |||
486 | loops = 0; | ||
487 | while ((delalloc_bytes || dio_bytes) && loops < 3) { | ||
488 | nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; | ||
489 | |||
490 | /* | ||
491 | * Triggers inode writeback for up to nr_pages. This will invoke | ||
492 | * ->writepages callback and trigger delalloc filling | ||
493 | * (btrfs_run_delalloc_range()). | ||
494 | */ | ||
495 | btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); | ||
496 | |||
497 | /* | ||
498 | * We need to wait for the compressed pages to start before | ||
499 | * we continue. | ||
500 | */ | ||
501 | async_pages = atomic_read(&fs_info->async_delalloc_pages); | ||
502 | if (!async_pages) | ||
503 | goto skip_async; | ||
504 | |||
505 | /* | ||
506 | 		 * Calculate how many compressed pages we want to be written | ||
507 | 		 * before we continue, i.e. if there are more async pages than we | ||
508 | 		 * require, wait_event will wait until nr_pages of them are written. | ||
509 | */ | ||
510 | if (async_pages <= nr_pages) | ||
511 | async_pages = 0; | ||
512 | else | ||
513 | async_pages -= nr_pages; | ||
514 | |||
515 | wait_event(fs_info->async_submit_wait, | ||
516 | atomic_read(&fs_info->async_delalloc_pages) <= | ||
517 | (int)async_pages); | ||
518 | skip_async: | ||
519 | spin_lock(&space_info->lock); | ||
520 | if (list_empty(&space_info->tickets) && | ||
521 | list_empty(&space_info->priority_tickets)) { | ||
522 | spin_unlock(&space_info->lock); | ||
523 | break; | ||
524 | } | ||
525 | spin_unlock(&space_info->lock); | ||
526 | |||
527 | loops++; | ||
528 | if (wait_ordered && !trans) { | ||
529 | btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); | ||
530 | } else { | ||
531 | time_left = schedule_timeout_killable(1); | ||
532 | if (time_left) | ||
533 | break; | ||
534 | } | ||
535 | delalloc_bytes = percpu_counter_sum_positive( | ||
536 | &fs_info->delalloc_bytes); | ||
537 | dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); | ||
538 | } | ||
539 | } | ||
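The sizing logic at the top of shrink_delalloc() in concrete numbers (the per-item metadata cost below is an assumed figure for illustration; the real one comes from btrfs_calc_trans_metadata_size()):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define EXTENT_SIZE_PER_ITEM (256 * 1024)

int main(void)
{
	uint64_t to_reclaim = 3 * 1024 * 1024;	/* bytes we want back */
	uint64_t per_item = 688 * 1024;		/* assumed metadata cost per item */
	uint64_t items = to_reclaim / per_item ? to_reclaim / per_item : 1;
	uint64_t delalloc_bytes = 8ULL * 1024 * 1024;
	uint64_t nr_pages;

	/* Flush enough delalloc to cover the items we expect to reclaim */
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
	nr_pages = (delalloc_bytes < to_reclaim ? delalloc_bytes : to_reclaim)
			>> PAGE_SHIFT;
	printf("flush %llu items, %llu pages of delalloc\n",
	       (unsigned long long)items, (unsigned long long)nr_pages);
	return 0;
}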
540 | |||
541 | /** | ||
542 |  * may_commit_transaction - possibly commit the transaction if it's ok to | ||
543 |  * @fs_info - the filesystem we are allocating in | ||
544 |  * @space_info - the space_info whose reservation tickets we are trying to | ||
545 |  *               satisfy | ||
546 | * | ||
547 | * This will check to make sure that committing the transaction will actually | ||
548 | * get us somewhere and then commit the transaction if it does. Otherwise it | ||
549 | * will return -ENOSPC. | ||
550 | */ | ||
551 | static int may_commit_transaction(struct btrfs_fs_info *fs_info, | ||
552 | struct btrfs_space_info *space_info) | ||
553 | { | ||
554 | struct reserve_ticket *ticket = NULL; | ||
555 | struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; | ||
556 | struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; | ||
557 | struct btrfs_trans_handle *trans; | ||
558 | u64 bytes_needed; | ||
559 | u64 reclaim_bytes = 0; | ||
560 | |||
561 | trans = (struct btrfs_trans_handle *)current->journal_info; | ||
562 | if (trans) | ||
563 | return -EAGAIN; | ||
564 | |||
565 | spin_lock(&space_info->lock); | ||
566 | if (!list_empty(&space_info->priority_tickets)) | ||
567 | ticket = list_first_entry(&space_info->priority_tickets, | ||
568 | struct reserve_ticket, list); | ||
569 | else if (!list_empty(&space_info->tickets)) | ||
570 | ticket = list_first_entry(&space_info->tickets, | ||
571 | struct reserve_ticket, list); | ||
572 | bytes_needed = (ticket) ? ticket->bytes : 0; | ||
573 | spin_unlock(&space_info->lock); | ||
574 | |||
575 | if (!bytes_needed) | ||
576 | return 0; | ||
577 | |||
578 | trans = btrfs_join_transaction(fs_info->extent_root); | ||
579 | if (IS_ERR(trans)) | ||
580 | return PTR_ERR(trans); | ||
581 | |||
582 | /* | ||
583 | * See if there is enough pinned space to make this reservation, or if | ||
584 | * we have block groups that are going to be freed, allowing us to | ||
585 | * possibly do a chunk allocation the next loop through. | ||
586 | */ | ||
587 | if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || | ||
588 | __percpu_counter_compare(&space_info->total_bytes_pinned, | ||
589 | bytes_needed, | ||
590 | BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) | ||
591 | goto commit; | ||
592 | |||
593 | /* | ||
594 | * See if there is some space in the delayed insertion reservation for | ||
595 | * this reservation. | ||
596 | */ | ||
597 | if (space_info != delayed_rsv->space_info) | ||
598 | goto enospc; | ||
599 | |||
600 | spin_lock(&delayed_rsv->lock); | ||
601 | reclaim_bytes += delayed_rsv->reserved; | ||
602 | spin_unlock(&delayed_rsv->lock); | ||
603 | |||
604 | spin_lock(&delayed_refs_rsv->lock); | ||
605 | reclaim_bytes += delayed_refs_rsv->reserved; | ||
606 | spin_unlock(&delayed_refs_rsv->lock); | ||
607 | if (reclaim_bytes >= bytes_needed) | ||
608 | goto commit; | ||
609 | bytes_needed -= reclaim_bytes; | ||
610 | |||
611 | if (__percpu_counter_compare(&space_info->total_bytes_pinned, | ||
612 | bytes_needed, | ||
613 | BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) | ||
614 | goto enospc; | ||
615 | |||
616 | commit: | ||
617 | return btrfs_commit_transaction(trans); | ||
618 | enospc: | ||
619 | btrfs_end_transaction(trans); | ||
620 | return -ENOSPC; | ||
621 | } | ||
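The commit-or-not decision above boils down to: is there enough pinned space, plus what the delayed rsvs would give back, to satisfy the oldest ticket? A sketch of just that arithmetic (illustrative only; it omits the space_info ownership check and the percpu batching):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool worth_committing(uint64_t bytes_needed, uint64_t total_bytes_pinned,
			     uint64_t delayed_rsv, uint64_t delayed_refs_rsv,
			     bool have_free_bgs)
{
	uint64_t reclaim = delayed_rsv + delayed_refs_rsv;

	if (bytes_needed == 0)
		return false;		/* nobody is waiting */
	if (have_free_bgs || total_bytes_pinned >= bytes_needed)
		return true;		/* pinned space alone covers it */
	if (reclaim >= bytes_needed)
		return true;		/* the rsvs we'd release cover it */
	/* otherwise the pinned bytes must cover what the rsvs don't */
	return total_bytes_pinned >= bytes_needed - reclaim;
}

int main(void)
{
	printf("commit worthwhile: %d\n",
	       worth_committing(1 << 20, 512 << 10, 256 << 10, 384 << 10, false));
	return 0;
}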
622 | |||
623 | /* | ||
624 | * Try to flush some data based on policy set by @state. This is only advisory | ||
625 | * and may fail for various reasons. The caller is supposed to examine the | ||
626 | * state of @space_info to detect the outcome. | ||
627 | */ | ||
628 | static void flush_space(struct btrfs_fs_info *fs_info, | ||
629 | struct btrfs_space_info *space_info, u64 num_bytes, | ||
630 | int state) | ||
631 | { | ||
632 | struct btrfs_root *root = fs_info->extent_root; | ||
633 | struct btrfs_trans_handle *trans; | ||
634 | int nr; | ||
635 | int ret = 0; | ||
636 | |||
637 | switch (state) { | ||
638 | case FLUSH_DELAYED_ITEMS_NR: | ||
639 | case FLUSH_DELAYED_ITEMS: | ||
640 | if (state == FLUSH_DELAYED_ITEMS_NR) | ||
641 | nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; | ||
642 | else | ||
643 | nr = -1; | ||
644 | |||
645 | trans = btrfs_join_transaction(root); | ||
646 | if (IS_ERR(trans)) { | ||
647 | ret = PTR_ERR(trans); | ||
648 | break; | ||
649 | } | ||
650 | ret = btrfs_run_delayed_items_nr(trans, nr); | ||
651 | btrfs_end_transaction(trans); | ||
652 | break; | ||
653 | case FLUSH_DELALLOC: | ||
654 | case FLUSH_DELALLOC_WAIT: | ||
655 | shrink_delalloc(fs_info, num_bytes * 2, num_bytes, | ||
656 | state == FLUSH_DELALLOC_WAIT); | ||
657 | break; | ||
658 | case FLUSH_DELAYED_REFS_NR: | ||
659 | case FLUSH_DELAYED_REFS: | ||
660 | trans = btrfs_join_transaction(root); | ||
661 | if (IS_ERR(trans)) { | ||
662 | ret = PTR_ERR(trans); | ||
663 | break; | ||
664 | } | ||
665 | if (state == FLUSH_DELAYED_REFS_NR) | ||
666 | nr = calc_reclaim_items_nr(fs_info, num_bytes); | ||
667 | else | ||
668 | nr = 0; | ||
669 | btrfs_run_delayed_refs(trans, nr); | ||
670 | btrfs_end_transaction(trans); | ||
671 | break; | ||
672 | case ALLOC_CHUNK: | ||
673 | case ALLOC_CHUNK_FORCE: | ||
674 | trans = btrfs_join_transaction(root); | ||
675 | if (IS_ERR(trans)) { | ||
676 | ret = PTR_ERR(trans); | ||
677 | break; | ||
678 | } | ||
679 | ret = btrfs_chunk_alloc(trans, | ||
680 | btrfs_metadata_alloc_profile(fs_info), | ||
681 | (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : | ||
682 | CHUNK_ALLOC_FORCE); | ||
683 | btrfs_end_transaction(trans); | ||
684 | if (ret > 0 || ret == -ENOSPC) | ||
685 | ret = 0; | ||
686 | break; | ||
687 | case COMMIT_TRANS: | ||
688 | /* | ||
689 | * If we have pending delayed iputs then we could free up a | ||
690 | * bunch of pinned space, so make sure we run the iputs before | ||
691 | * we do our pinned bytes check below. | ||
692 | */ | ||
693 | btrfs_run_delayed_iputs(fs_info); | ||
694 | btrfs_wait_on_delayed_iputs(fs_info); | ||
695 | |||
696 | ret = may_commit_transaction(fs_info, space_info); | ||
697 | break; | ||
698 | default: | ||
699 | ret = -ENOSPC; | ||
700 | break; | ||
701 | } | ||
702 | |||
703 | trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, | ||
704 | ret); | ||
705 | return; | ||
706 | } | ||
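flush_space() is driven through a fixed escalation order: delayed items, delalloc, delayed refs, chunk allocation, and finally a transaction commit. A sketch of stepping a reservation through that order until it is satisfied (the state names mirror the ones used above; the stopping point is made up for the example):

#include <stdio.h>

/* Escalation order used by the flushing code above (names only, for clarity) */
static const char * const flush_states[] = {
	"FLUSH_DELAYED_ITEMS_NR",
	"FLUSH_DELAYED_ITEMS",
	"FLUSH_DELALLOC",
	"FLUSH_DELALLOC_WAIT",
	"FLUSH_DELAYED_REFS_NR",
	"FLUSH_DELAYED_REFS",
	"ALLOC_CHUNK",
	"ALLOC_CHUNK_FORCE",
	"COMMIT_TRANS",
};

int main(void)
{
	int nr = (int)(sizeof(flush_states) / sizeof(flush_states[0]));
	int satisfied_at = 6;	/* pretend ALLOC_CHUNK freed enough space */

	for (int i = 0; i < nr; i++) {
		printf("flushing: %s\n", flush_states[i]);
		if (i == satisfied_at)
			break;
	}
	return 0;
}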
707 | |||
708 | static inline u64 | ||
709 | btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, | ||
710 | struct btrfs_space_info *space_info, | ||
711 | bool system_chunk) | ||
712 | { | ||
713 | struct reserve_ticket *ticket; | ||
714 | u64 used; | ||
715 | u64 expected; | ||
716 | u64 to_reclaim = 0; | ||
717 | |||
718 | list_for_each_entry(ticket, &space_info->tickets, list) | ||
719 | to_reclaim += ticket->bytes; | ||
720 | list_for_each_entry(ticket, &space_info->priority_tickets, list) | ||
721 | to_reclaim += ticket->bytes; | ||
722 | if (to_reclaim) | ||
723 | return to_reclaim; | ||
724 | |||
725 | to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); | ||
726 | if (can_overcommit(fs_info, space_info, to_reclaim, | ||
727 | BTRFS_RESERVE_FLUSH_ALL, system_chunk)) | ||
728 | return 0; | ||
729 | |||
730 | used = btrfs_space_info_used(space_info, true); | ||
731 | |||
732 | if (can_overcommit(fs_info, space_info, SZ_1M, | ||
733 | BTRFS_RESERVE_FLUSH_ALL, system_chunk)) | ||
734 | expected = div_factor_fine(space_info->total_bytes, 95); | ||
735 | else | ||
736 | expected = div_factor_fine(space_info->total_bytes, 90); | ||
737 | |||
738 | if (used > expected) | ||
739 | to_reclaim = used - expected; | ||
740 | else | ||
741 | to_reclaim = 0; | ||
742 | to_reclaim = min(to_reclaim, space_info->bytes_may_use + | ||
743 | space_info->bytes_reserved); | ||
744 | return to_reclaim; | ||
745 | } | ||
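btrfs_calc_reclaim_metadata_size() either sums the outstanding tickets or, with none queued, aims to bring usage back under roughly 90-95% of the space. A worked sketch of that fallback path (numbers are illustrative):

#include <stdint.h>
#include <stdio.h>

static uint64_t calc_reclaim_fallback(uint64_t total, uint64_t used,
				      uint64_t may_use, uint64_t reserved,
				      int can_overcommit_1m)
{
	/* Target 95% of the space if a small overcommit still fits, else 90% */
	uint64_t expected = total * (can_overcommit_1m ? 95 : 90) / 100;
	uint64_t to_reclaim = used > expected ? used - expected : 0;

	/* Never try to reclaim more than what is actually reclaimable */
	if (to_reclaim > may_use + reserved)
		to_reclaim = may_use + reserved;
	return to_reclaim;
}

int main(void)
{
	uint64_t r = calc_reclaim_fallback(8ULL << 30, 7900ULL << 20,
					   600ULL << 20, 100ULL << 20, 0);
	printf("to_reclaim = %llu MiB\n", (unsigned long long)(r >> 20));
	return 0;
}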
746 | |||
747 | static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, | ||
748 | struct btrfs_space_info *space_info, | ||
749 | u64 used, bool system_chunk) | ||
750 | { | ||
751 | u64 thresh = div_factor_fine(space_info->total_bytes, 98); | ||
752 | |||
753 | /* If we're just plain full then async reclaim just slows us down. */ | ||
754 | if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) | ||
755 | return 0; | ||
756 | |||
757 | if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, | ||
758 | system_chunk)) | ||
759 | return 0; | ||
760 | |||
761 | return (used >= thresh && !btrfs_fs_closing(fs_info) && | ||
762 | !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); | ||
763 | } | ||
764 | |||
765 | static bool wake_all_tickets(struct list_head *head) | ||
766 | { | ||
767 | struct reserve_ticket *ticket; | ||
768 | |||
769 | while (!list_empty(head)) { | ||
770 | ticket = list_first_entry(head, struct reserve_ticket, list); | ||
771 | list_del_init(&ticket->list); | ||
772 | ticket->error = -ENOSPC; | ||
773 | wake_up(&ticket->wait); | ||
774 | if (ticket->bytes != ticket->orig_bytes) | ||
775 | return true; | ||
776 | } | ||
777 | return false; | ||
778 | } | ||
779 | |||
780 | /* | ||
781 |  * This is for normal flushers; we can wait all goddamned day if we want to. We | ||
782 | * will loop and continuously try to flush as long as we are making progress. | ||
783 | * We count progress as clearing off tickets each time we have to loop. | ||
784 | */ | ||
785 | static void btrfs_async_reclaim_metadata_space(struct work_struct *work) | ||
786 | { | ||
787 | struct btrfs_fs_info *fs_info; | ||
788 | struct btrfs_space_info *space_info; | ||
789 | u64 to_reclaim; | ||
790 | int flush_state; | ||
791 | int commit_cycles = 0; | ||
792 | u64 last_tickets_id; | ||
793 | |||
794 | fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); | ||
795 | space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); | ||
796 | |||
797 | spin_lock(&space_info->lock); | ||
798 | to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, | ||
799 | false); | ||
800 | if (!to_reclaim) { | ||
801 | space_info->flush = 0; | ||
802 | spin_unlock(&space_info->lock); | ||
803 | return; | ||
804 | } | ||
805 | last_tickets_id = space_info->tickets_id; | ||
806 | spin_unlock(&space_info->lock); | ||
807 | |||
808 | flush_state = FLUSH_DELAYED_ITEMS_NR; | ||
809 | do { | ||
810 | flush_space(fs_info, space_info, to_reclaim, flush_state); | ||
811 | spin_lock(&space_info->lock); | ||
812 | if (list_empty(&space_info->tickets)) { | ||
813 | space_info->flush = 0; | ||
814 | spin_unlock(&space_info->lock); | ||
815 | return; | ||
816 | } | ||
817 | to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, | ||
818 | space_info, | ||
819 | false); | ||
820 | if (last_tickets_id == space_info->tickets_id) { | ||
821 | flush_state++; | ||
822 | } else { | ||
823 | last_tickets_id = space_info->tickets_id; | ||
824 | flush_state = FLUSH_DELAYED_ITEMS_NR; | ||
825 | if (commit_cycles) | ||
826 | commit_cycles--; | ||
827 | } | ||
828 | |||
829 | /* | ||
830 | * We don't want to force a chunk allocation until we've tried | ||
831 | * pretty hard to reclaim space. Think of the case where we | ||
832 | * freed up a bunch of space and so have a lot of pinned space | ||
833 | * to reclaim. We would rather use that than possibly create a | ||
834 | 	 * to reclaim. We would rather use that than possibly create an | ||
835 | * through the flushing state machine skip ALLOC_CHUNK_FORCE and | ||
836 | * commit the transaction. If nothing has changed the next go | ||
837 | * around then we can force a chunk allocation. | ||
838 | */ | ||
839 | if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) | ||
840 | flush_state++; | ||
841 | |||
842 | if (flush_state > COMMIT_TRANS) { | ||
843 | commit_cycles++; | ||
844 | if (commit_cycles > 2) { | ||
845 | if (wake_all_tickets(&space_info->tickets)) { | ||
846 | flush_state = FLUSH_DELAYED_ITEMS_NR; | ||
847 | commit_cycles--; | ||
848 | } else { | ||
849 | space_info->flush = 0; | ||
850 | } | ||
851 | } else { | ||
852 | flush_state = FLUSH_DELAYED_ITEMS_NR; | ||
853 | } | ||
854 | } | ||
855 | spin_unlock(&space_info->lock); | ||
856 | } while (flush_state <= COMMIT_TRANS); | ||
857 | } | ||
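The worker above measures progress via tickets_id: if a pass satisfied at least one ticket, it restarts from the cheapest flush state; otherwise it escalates, and after a few fruitless full cycles it fails the remaining tickets. A condensed model of that control flow (illustrative only, not the kernel loop):

#include <stdbool.h>
#include <stdio.h>

#define NR_STATES 9	/* FLUSH_DELAYED_ITEMS_NR .. COMMIT_TRANS */

/* Pretend flushing satisfies a ticket only late in the second cycle */
static bool flush_made_progress(int state, int commit_cycles)
{
	return commit_cycles == 1 && state == NR_STATES - 1;
}

int main(void)
{
	int state = 0, commit_cycles = 0;

	while (1) {
		printf("cycle %d, state %d\n", commit_cycles, state);
		if (flush_made_progress(state, commit_cycles)) {
			printf("ticket satisfied, restarting from state 0\n");
			break;
		}
		state++;
		if (state >= NR_STATES) {
			state = 0;
			if (++commit_cycles > 2) {
				printf("giving up, failing remaining tickets\n");
				break;
			}
		}
	}
	return 0;
}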
858 | |||
859 | void btrfs_init_async_reclaim_work(struct work_struct *work) | ||
860 | { | ||
861 | INIT_WORK(work, btrfs_async_reclaim_metadata_space); | ||
862 | } | ||
863 | |||
864 | static const enum btrfs_flush_state priority_flush_states[] = { | ||
865 | FLUSH_DELAYED_ITEMS_NR, | ||
866 | FLUSH_DELAYED_ITEMS, | ||
867 | ALLOC_CHUNK, | ||
868 | }; | ||
869 | |||
870 | static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, | ||
871 | struct btrfs_space_info *space_info, | ||
872 | struct reserve_ticket *ticket) | ||
873 | { | ||
874 | u64 to_reclaim; | ||
875 | int flush_state; | ||
876 | |||
877 | spin_lock(&space_info->lock); | ||
878 | to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, | ||
879 | false); | ||
880 | if (!to_reclaim) { | ||
881 | spin_unlock(&space_info->lock); | ||
882 | return; | ||
883 | } | ||
884 | spin_unlock(&space_info->lock); | ||
885 | |||
886 | flush_state = 0; | ||
887 | do { | ||
888 | flush_space(fs_info, space_info, to_reclaim, | ||
889 | priority_flush_states[flush_state]); | ||
890 | flush_state++; | ||
891 | spin_lock(&space_info->lock); | ||
892 | if (ticket->bytes == 0) { | ||
893 | spin_unlock(&space_info->lock); | ||
894 | return; | ||
895 | } | ||
896 | spin_unlock(&space_info->lock); | ||
897 | } while (flush_state < ARRAY_SIZE(priority_flush_states)); | ||
898 | } | ||
899 | |||
900 | static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, | ||
901 | struct btrfs_space_info *space_info, | ||
902 | struct reserve_ticket *ticket) | ||
903 | |||
904 | { | ||
905 | DEFINE_WAIT(wait); | ||
906 | u64 reclaim_bytes = 0; | ||
907 | int ret = 0; | ||
908 | |||
909 | spin_lock(&space_info->lock); | ||
910 | while (ticket->bytes > 0 && ticket->error == 0) { | ||
911 | ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); | ||
912 | if (ret) { | ||
913 | ret = -EINTR; | ||
914 | break; | ||
915 | } | ||
916 | spin_unlock(&space_info->lock); | ||
917 | |||
918 | schedule(); | ||
919 | |||
920 | finish_wait(&ticket->wait, &wait); | ||
921 | spin_lock(&space_info->lock); | ||
922 | } | ||
923 | if (!ret) | ||
924 | ret = ticket->error; | ||
925 | if (!list_empty(&ticket->list)) | ||
926 | list_del_init(&ticket->list); | ||
927 | if (ticket->bytes && ticket->bytes < ticket->orig_bytes) | ||
928 | reclaim_bytes = ticket->orig_bytes - ticket->bytes; | ||
929 | spin_unlock(&space_info->lock); | ||
930 | |||
931 | if (reclaim_bytes) | ||
932 | btrfs_space_info_add_old_bytes(fs_info, space_info, | ||
933 | reclaim_bytes); | ||
934 | return ret; | ||
935 | } | ||
936 | |||
937 | /** | ||
938 |  * __reserve_metadata_bytes - try to reserve bytes from the space_info | ||
939 |  * @fs_info - the filesystem we are allocating in | ||
940 | * @space_info - the space info we want to allocate from | ||
941 | * @orig_bytes - the number of bytes we want | ||
942 | * @flush - whether or not we can flush to make our reservation | ||
943 | * | ||
944 | * This will reserve orig_bytes number of bytes from the space info associated | ||
945 | * with the block_rsv. If there is not enough space it will make an attempt to | ||
946 | * flush out space to make room. It will do this by flushing delalloc if | ||
947 | * possible or committing the transaction. If flush is 0 then no attempts to | ||
948 | * regain reservations will be made and this will fail if there is not enough | ||
949 | * space already. | ||
950 | */ | ||
951 | static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, | ||
952 | struct btrfs_space_info *space_info, | ||
953 | u64 orig_bytes, | ||
954 | enum btrfs_reserve_flush_enum flush, | ||
955 | bool system_chunk) | ||
956 | { | ||
957 | struct reserve_ticket ticket; | ||
958 | u64 used; | ||
959 | u64 reclaim_bytes = 0; | ||
960 | int ret = 0; | ||
961 | |||
962 | ASSERT(orig_bytes); | ||
963 | ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); | ||
964 | |||
965 | spin_lock(&space_info->lock); | ||
966 | ret = -ENOSPC; | ||
967 | used = btrfs_space_info_used(space_info, true); | ||
968 | |||
969 | /* | ||
970 | * Carry on if we have enough space (short-circuit) OR call | ||
971 | * can_overcommit() to ensure we can overcommit to continue. | ||
972 | */ | ||
973 | if ((used + orig_bytes <= space_info->total_bytes) || | ||
974 | can_overcommit(fs_info, space_info, orig_bytes, flush, | ||
975 | system_chunk)) { | ||
976 | btrfs_space_info_update_bytes_may_use(fs_info, space_info, | ||
977 | orig_bytes); | ||
978 | trace_btrfs_space_reservation(fs_info, "space_info", | ||
979 | space_info->flags, orig_bytes, 1); | ||
980 | ret = 0; | ||
981 | } | ||
982 | |||
983 | /* | ||
984 | 	 * If we couldn't make a reservation then set up our reservation ticket | ||
985 | * and kick the async worker if it's not already running. | ||
986 | * | ||
987 | * If we are a priority flusher then we just need to add our ticket to | ||
988 | * the list and we will do our own flushing further down. | ||
989 | */ | ||
990 | if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { | ||
991 | ticket.orig_bytes = orig_bytes; | ||
992 | ticket.bytes = orig_bytes; | ||
993 | ticket.error = 0; | ||
994 | init_waitqueue_head(&ticket.wait); | ||
995 | if (flush == BTRFS_RESERVE_FLUSH_ALL) { | ||
996 | list_add_tail(&ticket.list, &space_info->tickets); | ||
997 | if (!space_info->flush) { | ||
998 | space_info->flush = 1; | ||
999 | trace_btrfs_trigger_flush(fs_info, | ||
1000 | space_info->flags, | ||
1001 | orig_bytes, flush, | ||
1002 | "enospc"); | ||
1003 | queue_work(system_unbound_wq, | ||
1004 | &fs_info->async_reclaim_work); | ||
1005 | } | ||
1006 | } else { | ||
1007 | list_add_tail(&ticket.list, | ||
1008 | &space_info->priority_tickets); | ||
1009 | } | ||
1010 | } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { | ||
1011 | used += orig_bytes; | ||
1012 | /* | ||
1013 | * We will do the space reservation dance during log replay, | ||
1014 | * which means we won't have fs_info->fs_root set, so don't do | ||
1015 | * the async reclaim as we will panic. | ||
1016 | */ | ||
1017 | if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && | ||
1018 | need_do_async_reclaim(fs_info, space_info, | ||
1019 | used, system_chunk) && | ||
1020 | !work_busy(&fs_info->async_reclaim_work)) { | ||
1021 | trace_btrfs_trigger_flush(fs_info, space_info->flags, | ||
1022 | orig_bytes, flush, "preempt"); | ||
1023 | queue_work(system_unbound_wq, | ||
1024 | &fs_info->async_reclaim_work); | ||
1025 | } | ||
1026 | } | ||
1027 | spin_unlock(&space_info->lock); | ||
1028 | if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) | ||
1029 | return ret; | ||
1030 | |||
1031 | if (flush == BTRFS_RESERVE_FLUSH_ALL) | ||
1032 | return wait_reserve_ticket(fs_info, space_info, &ticket); | ||
1033 | |||
1034 | ret = 0; | ||
1035 | priority_reclaim_metadata_space(fs_info, space_info, &ticket); | ||
1036 | spin_lock(&space_info->lock); | ||
1037 | if (ticket.bytes) { | ||
1038 | if (ticket.bytes < orig_bytes) | ||
1039 | reclaim_bytes = orig_bytes - ticket.bytes; | ||
1040 | list_del_init(&ticket.list); | ||
1041 | ret = -ENOSPC; | ||
1042 | } | ||
1043 | spin_unlock(&space_info->lock); | ||
1044 | |||
1045 | if (reclaim_bytes) | ||
1046 | btrfs_space_info_add_old_bytes(fs_info, space_info, | ||
1047 | reclaim_bytes); | ||
1048 | ASSERT(list_empty(&ticket.list)); | ||
1049 | return ret; | ||
1050 | } | ||
1051 | |||
1052 | /** | ||
1053 |  * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space | ||
1054 | * @root - the root we're allocating for | ||
1055 | * @block_rsv - the block_rsv we're allocating for | ||
1056 | * @orig_bytes - the number of bytes we want | ||
1057 | * @flush - whether or not we can flush to make our reservation | ||
1058 | * | ||
1059 | * This will reserve orig_bytes number of bytes from the space info associated | ||
1060 | * with the block_rsv. If there is not enough space it will make an attempt to | ||
1061 | * flush out space to make room. It will do this by flushing delalloc if | ||
1062 | * possible or committing the transaction. If flush is 0 then no attempts to | ||
1063 | * regain reservations will be made and this will fail if there is not enough | ||
1064 | * space already. | ||
1065 | */ | ||
1066 | int btrfs_reserve_metadata_bytes(struct btrfs_root *root, | ||
1067 | struct btrfs_block_rsv *block_rsv, | ||
1068 | u64 orig_bytes, | ||
1069 | enum btrfs_reserve_flush_enum flush) | ||
1070 | { | ||
1071 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
1072 | struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; | ||
1073 | int ret; | ||
1074 | bool system_chunk = (root == fs_info->chunk_root); | ||
1075 | |||
1076 | ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, | ||
1077 | orig_bytes, flush, system_chunk); | ||
1078 | if (ret == -ENOSPC && | ||
1079 | unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { | ||
1080 | if (block_rsv != global_rsv && | ||
1081 | !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes)) | ||
1082 | ret = 0; | ||
1083 | } | ||
1084 | if (ret == -ENOSPC) { | ||
1085 | trace_btrfs_space_reservation(fs_info, "space_info:enospc", | ||
1086 | block_rsv->space_info->flags, | ||
1087 | orig_bytes, 1); | ||
1088 | |||
1089 | if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) | ||
1090 | btrfs_dump_space_info(fs_info, block_rsv->space_info, | ||
1091 | orig_bytes, 0); | ||
1092 | } | ||
1093 | return ret; | ||
1094 | } | ||
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h new file mode 100644 index 000000000000..c2b54b8e1a14 --- /dev/null +++ b/fs/btrfs/space-info.h | |||
@@ -0,0 +1,133 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | |||
3 | #ifndef BTRFS_SPACE_INFO_H | ||
4 | #define BTRFS_SPACE_INFO_H | ||
5 | |||
6 | struct btrfs_space_info { | ||
7 | spinlock_t lock; | ||
8 | |||
9 | u64 total_bytes; /* total bytes in the space, | ||
10 | this doesn't take mirrors into account */ | ||
11 | u64 bytes_used; /* total bytes used, | ||
12 | this doesn't take mirrors into account */ | ||
13 | u64 bytes_pinned; /* total bytes pinned, will be freed when the | ||
14 | transaction finishes */ | ||
15 | u64 bytes_reserved; /* total bytes the allocator has reserved for | ||
16 | current allocations */ | ||
17 | u64 bytes_may_use; /* number of bytes that may be used for | ||
18 | delalloc/allocations */ | ||
19 | u64 bytes_readonly; /* total bytes that are read only */ | ||
20 | |||
21 | u64 max_extent_size; /* This will hold the maximum extent size of | ||
22 | the space info if we had an ENOSPC in the | ||
23 | allocator. */ | ||
24 | |||
25 | unsigned int full:1; /* indicates that we cannot allocate any more | ||
26 | chunks for this space */ | ||
27 | unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ | ||
28 | |||
29 | unsigned int flush:1; /* set if we are trying to make space */ | ||
30 | |||
31 | unsigned int force_alloc; /* set if we need to force a chunk | ||
32 | alloc for this space */ | ||
33 | |||
34 | u64 disk_used; /* total bytes used on disk */ | ||
35 | u64 disk_total; /* total bytes on disk, takes mirrors into | ||
36 | account */ | ||
37 | |||
38 | u64 flags; | ||
39 | |||
40 | /* | ||
41 | * bytes_pinned is kept in line with what is actually pinned, as in | ||
42 | * we've called update_block_group and dropped the bytes_used counter | ||
43 | * and increased the bytes_pinned counter. However this means that | ||
44 | * bytes_pinned does not reflect the bytes that will be pinned once the | ||
45 | * delayed refs are flushed, so this counter is inc'ed every time we | ||
46 | * call btrfs_free_extent so it is a realtime count of what will be | ||
47 | * freed once the transaction is committed. It will be zeroed every | ||
48 | * time the transaction commits. | ||
49 | */ | ||
50 | struct percpu_counter total_bytes_pinned; | ||
51 | |||
52 | struct list_head list; | ||
53 | /* Protected by the spinlock 'lock'. */ | ||
54 | struct list_head ro_bgs; | ||
55 | struct list_head priority_tickets; | ||
56 | struct list_head tickets; | ||
57 | /* | ||
58 |  * tickets_id just indicates the next ticket that will be handled; note | ||
59 |  * that it's not stored per ticket. | ||
60 | */ | ||
61 | u64 tickets_id; | ||
62 | |||
63 | struct rw_semaphore groups_sem; | ||
64 | /* for block groups in our same type */ | ||
65 | struct list_head block_groups[BTRFS_NR_RAID_TYPES]; | ||
66 | wait_queue_head_t wait; | ||
67 | |||
68 | struct kobject kobj; | ||
69 | struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; | ||
70 | }; | ||
71 | |||
72 | struct reserve_ticket { | ||
73 | u64 orig_bytes; | ||
74 | u64 bytes; | ||
75 | int error; | ||
76 | struct list_head list; | ||
77 | wait_queue_head_t wait; | ||
78 | }; | ||
79 | |||
80 | static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) | ||
81 | { | ||
82 | return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && | ||
83 | (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); | ||
84 | } | ||
85 | |||
86 | /* | ||
87 |  * Declare a helper function to detect underflow of various space info | ||
88 |  * members. | ||
89 | */ | ||
90 | #define DECLARE_SPACE_INFO_UPDATE(name) \ | ||
91 | static inline void \ | ||
92 | btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ | ||
93 | struct btrfs_space_info *sinfo, \ | ||
94 | s64 bytes) \ | ||
95 | { \ | ||
96 | lockdep_assert_held(&sinfo->lock); \ | ||
97 | trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ | ||
98 | if (bytes < 0 && sinfo->name < -bytes) { \ | ||
99 | WARN_ON(1); \ | ||
100 | sinfo->name = 0; \ | ||
101 | return; \ | ||
102 | } \ | ||
103 | sinfo->name += bytes; \ | ||
104 | } | ||
105 | |||
106 | DECLARE_SPACE_INFO_UPDATE(bytes_may_use); | ||
107 | DECLARE_SPACE_INFO_UPDATE(bytes_pinned); | ||
108 | |||
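For reference, a sketch of what the DECLARE_SPACE_INFO_UPDATE(bytes_may_use) invocation expands to, minus the lockdep assertion and the tracepoint (the struct here is trimmed to the one field for illustration):

#include <stdint.h>
#include <stdio.h>

struct space_info_model {
	uint64_t bytes_may_use;
};

/* Clamp at zero instead of wrapping around when asked to subtract too much */
static void update_bytes_may_use(struct space_info_model *sinfo, int64_t bytes)
{
	if (bytes < 0 && sinfo->bytes_may_use < (uint64_t)-bytes) {
		fprintf(stderr, "warning: bytes_may_use underflow\n");
		sinfo->bytes_may_use = 0;
		return;
	}
	sinfo->bytes_may_use += bytes;
}

int main(void)
{
	struct space_info_model s = { .bytes_may_use = 4096 };

	update_bytes_may_use(&s, -8192);	/* triggers the underflow clamp */
	printf("bytes_may_use = %llu\n", (unsigned long long)s.bytes_may_use);
	return 0;
}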
109 | void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, | ||
110 | struct btrfs_space_info *space_info, | ||
111 | u64 num_bytes); | ||
112 | void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, | ||
113 | struct btrfs_space_info *space_info, | ||
114 | u64 num_bytes); | ||
115 | int btrfs_init_space_info(struct btrfs_fs_info *fs_info); | ||
116 | void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, | ||
117 | u64 total_bytes, u64 bytes_used, | ||
118 | u64 bytes_readonly, | ||
119 | struct btrfs_space_info **space_info); | ||
120 | struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, | ||
121 | u64 flags); | ||
122 | u64 btrfs_space_info_used(struct btrfs_space_info *s_info, | ||
123 | bool may_use_included); | ||
124 | void btrfs_clear_space_info_full(struct btrfs_fs_info *info); | ||
125 | void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, | ||
126 | struct btrfs_space_info *info, u64 bytes, | ||
127 | int dump_block_groups); | ||
128 | int btrfs_reserve_metadata_bytes(struct btrfs_root *root, | ||
129 | struct btrfs_block_rsv *block_rsv, | ||
130 | u64 orig_bytes, | ||
131 | enum btrfs_reserve_flush_enum flush); | ||
132 | |||
133 | #endif /* BTRFS_SPACE_INFO_H */ | ||
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0645ec428b4f..78de9d5d80c6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include "dev-replace.h" | 42 | #include "dev-replace.h" |
43 | #include "free-space-cache.h" | 43 | #include "free-space-cache.h" |
44 | #include "backref.h" | 44 | #include "backref.h" |
45 | #include "space-info.h" | ||
45 | #include "tests/btrfs-tests.h" | 46 | #include "tests/btrfs-tests.h" |
46 | 47 | ||
47 | #include "qgroup.h" | 48 | #include "qgroup.h" |
@@ -1553,6 +1554,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, | |||
1553 | } else { | 1554 | } else { |
1554 | snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); | 1555 | snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); |
1555 | btrfs_sb(s)->bdev_holder = fs_type; | 1556 | btrfs_sb(s)->bdev_holder = fs_type; |
1557 | if (!strstr(crc32c_impl(), "generic")) | ||
1558 | set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); | ||
1556 | error = btrfs_fill_super(s, fs_devices, data); | 1559 | error = btrfs_fill_super(s, fs_devices, data); |
1557 | } | 1560 | } |
1558 | if (!error) | 1561 | if (!error) |
@@ -1601,14 +1604,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, | |||
1601 | { | 1604 | { |
1602 | struct vfsmount *mnt_root; | 1605 | struct vfsmount *mnt_root; |
1603 | struct dentry *root; | 1606 | struct dentry *root; |
1604 | fmode_t mode = FMODE_READ; | ||
1605 | char *subvol_name = NULL; | 1607 | char *subvol_name = NULL; |
1606 | u64 subvol_objectid = 0; | 1608 | u64 subvol_objectid = 0; |
1607 | int error = 0; | 1609 | int error = 0; |
1608 | 1610 | ||
1609 | if (!(flags & SB_RDONLY)) | ||
1610 | mode |= FMODE_WRITE; | ||
1611 | |||
1612 | error = btrfs_parse_subvol_options(data, &subvol_name, | 1611 | error = btrfs_parse_subvol_options(data, &subvol_name, |
1613 | &subvol_objectid); | 1612 | &subvol_objectid); |
1614 | if (error) { | 1613 | if (error) { |
@@ -1904,8 +1903,9 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, | |||
1904 | u64 type; | 1903 | u64 type; |
1905 | u64 avail_space; | 1904 | u64 avail_space; |
1906 | u64 min_stripe_size; | 1905 | u64 min_stripe_size; |
1907 | int min_stripes = 1, num_stripes = 1; | 1906 | int min_stripes, num_stripes = 1; |
1908 | int i = 0, nr_devices; | 1907 | int i = 0, nr_devices; |
1908 | const struct btrfs_raid_attr *rattr; | ||
1909 | 1909 | ||
1910 | /* | 1910 | /* |
1911 | * We aren't under the device list lock, so this is racy-ish, but good | 1911 | * We aren't under the device list lock, so this is racy-ish, but good |
@@ -1929,21 +1929,18 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, | |||
1929 | 1929 | ||
1930 | /* calc min stripe number for data space allocation */ | 1930 | /* calc min stripe number for data space allocation */ |
1931 | type = btrfs_data_alloc_profile(fs_info); | 1931 | type = btrfs_data_alloc_profile(fs_info); |
1932 | if (type & BTRFS_BLOCK_GROUP_RAID0) { | 1932 | rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)]; |
1933 | min_stripes = 2; | 1933 | min_stripes = rattr->devs_min; |
1934 | |||
1935 | if (type & BTRFS_BLOCK_GROUP_RAID0) | ||
1934 | num_stripes = nr_devices; | 1936 | num_stripes = nr_devices; |
1935 | } else if (type & BTRFS_BLOCK_GROUP_RAID1) { | 1937 | else if (type & BTRFS_BLOCK_GROUP_RAID1) |
1936 | min_stripes = 2; | ||
1937 | num_stripes = 2; | 1938 | num_stripes = 2; |
1938 | } else if (type & BTRFS_BLOCK_GROUP_RAID10) { | 1939 | else if (type & BTRFS_BLOCK_GROUP_RAID10) |
1939 | min_stripes = 4; | ||
1940 | num_stripes = 4; | 1940 | num_stripes = 4; |
1941 | } | ||
1942 | 1941 | ||
1943 | if (type & BTRFS_BLOCK_GROUP_DUP) | 1942 | /* Adjust for more than 1 stripe per device */ |
1944 | min_stripe_size = 2 * BTRFS_STRIPE_LEN; | 1943 | min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN; |
1945 | else | ||
1946 | min_stripe_size = BTRFS_STRIPE_LEN; | ||
1947 | 1944 | ||
1948 | rcu_read_lock(); | 1945 | rcu_read_lock(); |
1949 | list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { | 1946 | list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { |
@@ -2466,3 +2463,4 @@ late_initcall(init_btrfs_fs); | |||
2466 | module_exit(exit_btrfs_fs) | 2463 | module_exit(exit_btrfs_fs) |
2467 | 2464 | ||
2468 | MODULE_LICENSE("GPL"); | 2465 | MODULE_LICENSE("GPL"); |
2466 | MODULE_SOFTDEP("pre: crc32c"); | ||
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c1dfc97893ba..9539f8143b7a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include "transaction.h" | 16 | #include "transaction.h" |
17 | #include "sysfs.h" | 17 | #include "sysfs.h" |
18 | #include "volumes.h" | 18 | #include "volumes.h" |
19 | #include "space-info.h" | ||
19 | 20 | ||
20 | static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); | 21 | static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); |
21 | static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); | 22 | static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); |
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 7bf4d5734dbe..1bf6b5a79191 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include "btrfs-tests.h" | 10 | #include "btrfs-tests.h" |
11 | #include "../ctree.h" | 11 | #include "../ctree.h" |
12 | #include "../extent_io.h" | 12 | #include "../extent_io.h" |
13 | #include "../btrfs_inode.h" | ||
13 | 14 | ||
14 | #define PROCESS_UNLOCK (1 << 0) | 15 | #define PROCESS_UNLOCK (1 << 0) |
15 | #define PROCESS_RELEASE (1 << 1) | 16 | #define PROCESS_RELEASE (1 << 1) |
@@ -58,7 +59,7 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, | |||
58 | static int test_find_delalloc(u32 sectorsize) | 59 | static int test_find_delalloc(u32 sectorsize) |
59 | { | 60 | { |
60 | struct inode *inode; | 61 | struct inode *inode; |
61 | struct extent_io_tree tmp; | 62 | struct extent_io_tree *tmp; |
62 | struct page *page; | 63 | struct page *page; |
63 | struct page *locked_page = NULL; | 64 | struct page *locked_page = NULL; |
64 | unsigned long index = 0; | 65 | unsigned long index = 0; |
@@ -76,12 +77,13 @@ static int test_find_delalloc(u32 sectorsize) | |||
76 | test_std_err(TEST_ALLOC_INODE); | 77 | test_std_err(TEST_ALLOC_INODE); |
77 | return -ENOMEM; | 78 | return -ENOMEM; |
78 | } | 79 | } |
80 | tmp = &BTRFS_I(inode)->io_tree; | ||
79 | 81 | ||
80 | /* | 82 | /* |
81 | * Passing NULL as we don't have fs_info but tracepoints are not used | 83 | * Passing NULL as we don't have fs_info but tracepoints are not used |
82 | * at this point | 84 | * at this point |
83 | */ | 85 | */ |
84 | extent_io_tree_init(NULL, &tmp, IO_TREE_SELFTEST, NULL); | 86 | extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL); |
85 | 87 | ||
86 | /* | 88 | /* |
87 | * First go through and create and mark all of our pages dirty, we pin | 89 | * First go through and create and mark all of our pages dirty, we pin |
@@ -108,10 +110,10 @@ static int test_find_delalloc(u32 sectorsize) | |||
108 | * |--- delalloc ---| | 110 | * |--- delalloc ---| |
109 | * |--- search ---| | 111 | * |--- search ---| |
110 | */ | 112 | */ |
111 | set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); | 113 | set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL); |
112 | start = 0; | 114 | start = 0; |
113 | end = 0; | 115 | end = 0; |
114 | found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, | 116 | found = find_lock_delalloc_range(inode, locked_page, &start, |
115 | &end); | 117 | &end); |
116 | if (!found) { | 118 | if (!found) { |
117 | test_err("should have found at least one delalloc"); | 119 | test_err("should have found at least one delalloc"); |
@@ -122,7 +124,7 @@ static int test_find_delalloc(u32 sectorsize) | |||
122 | sectorsize - 1, start, end); | 124 | sectorsize - 1, start, end); |
123 | goto out_bits; | 125 | goto out_bits; |
124 | } | 126 | } |
125 | unlock_extent(&tmp, start, end); | 127 | unlock_extent(tmp, start, end); |
126 | unlock_page(locked_page); | 128 | unlock_page(locked_page); |
127 | put_page(locked_page); | 129 | put_page(locked_page); |
128 | 130 | ||
@@ -139,10 +141,10 @@ static int test_find_delalloc(u32 sectorsize) | |||
139 | test_err("couldn't find the locked page"); | 141 | test_err("couldn't find the locked page"); |
140 | goto out_bits; | 142 | goto out_bits; |
141 | } | 143 | } |
142 | set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); | 144 | set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL); |
143 | start = test_start; | 145 | start = test_start; |
144 | end = 0; | 146 | end = 0; |
145 | found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, | 147 | found = find_lock_delalloc_range(inode, locked_page, &start, |
146 | &end); | 148 | &end); |
147 | if (!found) { | 149 | if (!found) { |
148 | test_err("couldn't find delalloc in our range"); | 150 | test_err("couldn't find delalloc in our range"); |
@@ -158,7 +160,7 @@ static int test_find_delalloc(u32 sectorsize) | |||
158 | test_err("there were unlocked pages in the range"); | 160 | test_err("there were unlocked pages in the range"); |
159 | goto out_bits; | 161 | goto out_bits; |
160 | } | 162 | } |
161 | unlock_extent(&tmp, start, end); | 163 | unlock_extent(tmp, start, end); |
162 | /* locked_page was unlocked above */ | 164 | /* locked_page was unlocked above */ |
163 | put_page(locked_page); | 165 | put_page(locked_page); |
164 | 166 | ||
@@ -176,7 +178,7 @@ static int test_find_delalloc(u32 sectorsize) | |||
176 | } | 178 | } |
177 | start = test_start; | 179 | start = test_start; |
178 | end = 0; | 180 | end = 0; |
179 | found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, | 181 | found = find_lock_delalloc_range(inode, locked_page, &start, |
180 | &end); | 182 | &end); |
181 | if (found) { | 183 | if (found) { |
182 | test_err("found range when we shouldn't have"); | 184 | test_err("found range when we shouldn't have"); |
@@ -194,10 +196,10 @@ static int test_find_delalloc(u32 sectorsize) | |||
194 | * | 196 | * |
195 | * We are re-using our test_start from above since it works out well. | 197 | * We are re-using our test_start from above since it works out well. |
196 | */ | 198 | */ |
197 | set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); | 199 | set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL); |
198 | start = test_start; | 200 | start = test_start; |
199 | end = 0; | 201 | end = 0; |
200 | found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, | 202 | found = find_lock_delalloc_range(inode, locked_page, &start, |
201 | &end); | 203 | &end); |
202 | if (!found) { | 204 | if (!found) { |
203 | test_err("didn't find our range"); | 205 | test_err("didn't find our range"); |
@@ -213,7 +215,7 @@ static int test_find_delalloc(u32 sectorsize) | |||
213 | test_err("pages in range were not all locked"); | 215 | test_err("pages in range were not all locked"); |
214 | goto out_bits; | 216 | goto out_bits; |
215 | } | 217 | } |
216 | unlock_extent(&tmp, start, end); | 218 | unlock_extent(tmp, start, end); |
217 | 219 | ||
218 | /* | 220 | /* |
219 | * Now to test where we run into a page that is no longer dirty in the | 221 | * Now to test where we run into a page that is no longer dirty in the |
@@ -238,7 +240,7 @@ static int test_find_delalloc(u32 sectorsize) | |||
238 | * this changes at any point in the future we will need to fix this | 240 | * this changes at any point in the future we will need to fix this |
239 | * tests expected behavior. | 241 | * tests expected behavior. |
240 | */ | 242 | */ |
241 | found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, | 243 | found = find_lock_delalloc_range(inode, locked_page, &start, |
242 | &end); | 244 | &end); |
243 | if (!found) { | 245 | if (!found) { |
244 | test_err("didn't find our range"); | 246 | test_err("didn't find our range"); |
@@ -256,7 +258,7 @@ static int test_find_delalloc(u32 sectorsize) | |||
256 | } | 258 | } |
257 | ret = 0; | 259 | ret = 0; |
258 | out_bits: | 260 | out_bits: |
259 | clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1); | 261 | clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); |
260 | out: | 262 | out: |
261 | if (locked_page) | 263 | if (locked_page) |
262 | put_page(locked_page); | 264 | put_page(locked_page); |
@@ -432,6 +434,89 @@ out: | |||
432 | return ret; | 434 | return ret; |
433 | } | 435 | } |
434 | 436 | ||
437 | static int test_find_first_clear_extent_bit(void) | ||
438 | { | ||
439 | struct extent_io_tree tree; | ||
440 | u64 start, end; | ||
441 | |||
442 | test_msg("running find_first_clear_extent_bit test"); | ||
443 | extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); | ||
444 | |||
445 | /* | ||
446 | * Set 1M-4M alloc/discard; 32M-64M is added below, leaving a hole between | ||
447 | * 4M-32M | ||
448 | */ | ||
449 | set_extent_bits(&tree, SZ_1M, SZ_4M - 1, | ||
450 | CHUNK_TRIMMED | CHUNK_ALLOCATED); | ||
451 | |||
452 | find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, | ||
453 | CHUNK_TRIMMED | CHUNK_ALLOCATED); | ||
454 | |||
455 | if (start != 0 || end != SZ_1M - 1) | ||
456 | test_err("error finding beginning range: start %llu end %llu", | ||
457 | start, end); | ||
458 | |||
459 | /* Now add 32M-64M so that we have a hole between 4M-32M */ | ||
460 | set_extent_bits(&tree, SZ_32M, SZ_64M - 1, | ||
461 | CHUNK_TRIMMED | CHUNK_ALLOCATED); | ||
462 | |||
463 | /* | ||
464 | * Request first hole starting at 12M, we should get 4M-32M | ||
465 | */ | ||
466 | find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, | ||
467 | CHUNK_TRIMMED | CHUNK_ALLOCATED); | ||
468 | |||
469 | if (start != SZ_4M || end != SZ_32M - 1) | ||
470 | test_err("error finding trimmed range: start %llu end %llu", | ||
471 | start, end); | ||
472 | |||
473 | /* | ||
474 | * Search in the middle of allocated range, should get the next one | ||
475 | * available, which happens to be unallocated -> 4M-32M | ||
476 | */ | ||
477 | find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, | ||
478 | CHUNK_TRIMMED | CHUNK_ALLOCATED); | ||
479 | |||
480 | if (start != SZ_4M || end != SZ_32M - 1) | ||
481 | test_err("error finding next unalloc range: start %llu end %llu", | ||
482 | start, end); | ||
483 | |||
484 | /* | ||
485 | * Set 64M-72M with the CHUNK_ALLOCATED flag, then search for the CHUNK_TRIMMED | ||
486 | * flag being unset in this range; we should get the entry in range 64M-72M | ||
487 | */ | ||
488 | set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED); | ||
489 | find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, | ||
490 | CHUNK_TRIMMED); | ||
491 | |||
492 | if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) | ||
493 | test_err("error finding exact range: start %llu end %llu", | ||
494 | start, end); | ||
495 | |||
496 | find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, | ||
497 | CHUNK_TRIMMED); | ||
498 | |||
499 | /* | ||
500 | * Search in the middle of a set range whose immediate neighbour doesn't | ||
501 | * have the bits set, so it must be returned | ||
502 | */ | ||
503 | if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) | ||
504 | test_err("error finding next alloc range: start %llu end %llu", | ||
505 | start, end); | ||
506 | |||
507 | /* | ||
508 | * Search beyond any known range; it should return the area after the last | ||
509 | * known range, with end set to -1 | ||
510 | */ | ||
511 | find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); | ||
512 | if (start != SZ_64M + SZ_8M || end != -1) | ||
513 | test_err( | ||
514 | "error handling beyond end of range search: start %llu end %llu", | ||
515 | start, end); | ||
516 | |||
517 | return 0; | ||
518 | } | ||
519 | |||
435 | int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) | 520 | int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) |
436 | { | 521 | { |
437 | int ret; | 522 | int ret; |
@@ -442,6 +527,10 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) | |||
442 | if (ret) | 527 | if (ret) |
443 | goto out; | 528 | goto out; |
444 | 529 | ||
530 | ret = test_find_first_clear_extent_bit(); | ||
531 | if (ret) | ||
532 | goto out; | ||
533 | |||
445 | ret = test_eb_bitmaps(sectorsize, nodesize); | 534 | ret = test_eb_bitmaps(sectorsize, nodesize); |
446 | out: | 535 | out: |
447 | return ret; | 536 | return ret; |
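The selftest added above exercises find_first_clear_extent_bit(), which reports the first range in which none of the requested bits are set. A minimal sketch of the same semantics, reusing only the calls and constants visible in the test (illustrative, not part of the patch):

	struct extent_io_tree tree;
	u64 start, end;

	extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL);
	set_extent_bits(&tree, SZ_1M, SZ_4M - 1, CHUNK_TRIMMED | CHUNK_ALLOCATED);
	set_extent_bits(&tree, SZ_32M, SZ_64M - 1, CHUNK_TRIMMED | CHUNK_ALLOCATED);

	/* Expect the hole between the two set ranges: start == SZ_4M, end == SZ_32M - 1 */
	find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end,
				    CHUNK_TRIMMED | CHUNK_ALLOCATED);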
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 87aeabe9d610..4a7f796c9900 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c | |||
@@ -66,7 +66,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, | |||
66 | em->len = SZ_16K; | 66 | em->len = SZ_16K; |
67 | em->block_start = 0; | 67 | em->block_start = 0; |
68 | em->block_len = SZ_16K; | 68 | em->block_len = SZ_16K; |
69 | write_lock(&em_tree->lock); | ||
69 | ret = add_extent_mapping(em_tree, em, 0); | 70 | ret = add_extent_mapping(em_tree, em, 0); |
71 | write_unlock(&em_tree->lock); | ||
70 | if (ret < 0) { | 72 | if (ret < 0) { |
71 | test_err("cannot add extent range [0, 16K)"); | 73 | test_err("cannot add extent range [0, 16K)"); |
72 | goto out; | 74 | goto out; |
@@ -85,7 +87,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, | |||
85 | em->len = SZ_4K; | 87 | em->len = SZ_4K; |
86 | em->block_start = SZ_32K; /* avoid merging */ | 88 | em->block_start = SZ_32K; /* avoid merging */ |
87 | em->block_len = SZ_4K; | 89 | em->block_len = SZ_4K; |
90 | write_lock(&em_tree->lock); | ||
88 | ret = add_extent_mapping(em_tree, em, 0); | 91 | ret = add_extent_mapping(em_tree, em, 0); |
92 | write_unlock(&em_tree->lock); | ||
89 | if (ret < 0) { | 93 | if (ret < 0) { |
90 | test_err("cannot add extent range [16K, 20K)"); | 94 | test_err("cannot add extent range [16K, 20K)"); |
91 | goto out; | 95 | goto out; |
@@ -104,7 +108,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, | |||
104 | em->len = len; | 108 | em->len = len; |
105 | em->block_start = start; | 109 | em->block_start = start; |
106 | em->block_len = len; | 110 | em->block_len = len; |
111 | write_lock(&em_tree->lock); | ||
107 | ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); | 112 | ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); |
113 | write_unlock(&em_tree->lock); | ||
108 | if (ret) { | 114 | if (ret) { |
109 | test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); | 115 | test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); |
110 | goto out; | 116 | goto out; |
@@ -148,7 +154,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, | |||
148 | em->len = SZ_1K; | 154 | em->len = SZ_1K; |
149 | em->block_start = EXTENT_MAP_INLINE; | 155 | em->block_start = EXTENT_MAP_INLINE; |
150 | em->block_len = (u64)-1; | 156 | em->block_len = (u64)-1; |
157 | write_lock(&em_tree->lock); | ||
151 | ret = add_extent_mapping(em_tree, em, 0); | 158 | ret = add_extent_mapping(em_tree, em, 0); |
159 | write_unlock(&em_tree->lock); | ||
152 | if (ret < 0) { | 160 | if (ret < 0) { |
153 | test_err("cannot add extent range [0, 1K)"); | 161 | test_err("cannot add extent range [0, 1K)"); |
154 | goto out; | 162 | goto out; |
@@ -167,7 +175,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, | |||
167 | em->len = SZ_4K; | 175 | em->len = SZ_4K; |
168 | em->block_start = SZ_4K; | 176 | em->block_start = SZ_4K; |
169 | em->block_len = SZ_4K; | 177 | em->block_len = SZ_4K; |
178 | write_lock(&em_tree->lock); | ||
170 | ret = add_extent_mapping(em_tree, em, 0); | 179 | ret = add_extent_mapping(em_tree, em, 0); |
180 | write_unlock(&em_tree->lock); | ||
171 | if (ret < 0) { | 181 | if (ret < 0) { |
172 | test_err("cannot add extent range [4K, 8K)"); | 182 | test_err("cannot add extent range [4K, 8K)"); |
173 | goto out; | 183 | goto out; |
@@ -186,7 +196,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, | |||
186 | em->len = SZ_1K; | 196 | em->len = SZ_1K; |
187 | em->block_start = EXTENT_MAP_INLINE; | 197 | em->block_start = EXTENT_MAP_INLINE; |
188 | em->block_len = (u64)-1; | 198 | em->block_len = (u64)-1; |
199 | write_lock(&em_tree->lock); | ||
189 | ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); | 200 | ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); |
201 | write_unlock(&em_tree->lock); | ||
190 | if (ret) { | 202 | if (ret) { |
191 | test_err("case2 [0 1K]: ret %d", ret); | 203 | test_err("case2 [0 1K]: ret %d", ret); |
192 | goto out; | 204 | goto out; |
@@ -225,7 +237,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, | |||
225 | em->len = SZ_4K; | 237 | em->len = SZ_4K; |
226 | em->block_start = SZ_4K; | 238 | em->block_start = SZ_4K; |
227 | em->block_len = SZ_4K; | 239 | em->block_len = SZ_4K; |
240 | write_lock(&em_tree->lock); | ||
228 | ret = add_extent_mapping(em_tree, em, 0); | 241 | ret = add_extent_mapping(em_tree, em, 0); |
242 | write_unlock(&em_tree->lock); | ||
229 | if (ret < 0) { | 243 | if (ret < 0) { |
230 | test_err("cannot add extent range [4K, 8K)"); | 244 | test_err("cannot add extent range [4K, 8K)"); |
231 | goto out; | 245 | goto out; |
@@ -244,7 +258,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, | |||
244 | em->len = SZ_16K; | 258 | em->len = SZ_16K; |
245 | em->block_start = 0; | 259 | em->block_start = 0; |
246 | em->block_len = SZ_16K; | 260 | em->block_len = SZ_16K; |
261 | write_lock(&em_tree->lock); | ||
247 | ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); | 262 | ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); |
263 | write_unlock(&em_tree->lock); | ||
248 | if (ret) { | 264 | if (ret) { |
249 | test_err("case3 [0x%llx 0x%llx): ret %d", | 265 | test_err("case3 [0x%llx 0x%llx): ret %d", |
250 | start, start + len, ret); | 266 | start, start + len, ret); |
@@ -320,7 +336,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, | |||
320 | em->len = SZ_8K; | 336 | em->len = SZ_8K; |
321 | em->block_start = 0; | 337 | em->block_start = 0; |
322 | em->block_len = SZ_8K; | 338 | em->block_len = SZ_8K; |
339 | write_lock(&em_tree->lock); | ||
323 | ret = add_extent_mapping(em_tree, em, 0); | 340 | ret = add_extent_mapping(em_tree, em, 0); |
341 | write_unlock(&em_tree->lock); | ||
324 | if (ret < 0) { | 342 | if (ret < 0) { |
325 | test_err("cannot add extent range [0, 8K)"); | 343 | test_err("cannot add extent range [0, 8K)"); |
326 | goto out; | 344 | goto out; |
@@ -339,7 +357,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, | |||
339 | em->len = 24 * SZ_1K; | 357 | em->len = 24 * SZ_1K; |
340 | em->block_start = SZ_16K; /* avoid merging */ | 358 | em->block_start = SZ_16K; /* avoid merging */ |
341 | em->block_len = 24 * SZ_1K; | 359 | em->block_len = 24 * SZ_1K; |
360 | write_lock(&em_tree->lock); | ||
342 | ret = add_extent_mapping(em_tree, em, 0); | 361 | ret = add_extent_mapping(em_tree, em, 0); |
362 | write_unlock(&em_tree->lock); | ||
343 | if (ret < 0) { | 363 | if (ret < 0) { |
344 | test_err("cannot add extent range [8K, 32K)"); | 364 | test_err("cannot add extent range [8K, 32K)"); |
345 | goto out; | 365 | goto out; |
@@ -357,7 +377,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, | |||
357 | em->len = SZ_32K; | 377 | em->len = SZ_32K; |
358 | em->block_start = 0; | 378 | em->block_start = 0; |
359 | em->block_len = SZ_32K; | 379 | em->block_len = SZ_32K; |
380 | write_lock(&em_tree->lock); | ||
360 | ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); | 381 | ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); |
382 | write_unlock(&em_tree->lock); | ||
361 | if (ret) { | 383 | if (ret) { |
362 | test_err("case4 [0x%llx 0x%llx): ret %d", | 384 | test_err("case4 [0x%llx 0x%llx): ret %d", |
363 | start, len, ret); | 385 | start, len, ret); |
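Every hunk in this file follows the same shape: the extent map tree's rwlock is now taken explicitly around the insertion, presumably because add_extent_mapping() and btrfs_add_extent_mapping() expect the caller to hold it rather than taking it internally. The recurring pattern, copied from the hunks above:

	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em, 0);
	write_unlock(&em_tree->lock);
	if (ret < 0)
		goto out;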
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3f6811cdf803..3b8ae1a8f02d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -129,6 +129,24 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans) | |||
129 | } | 129 | } |
130 | 130 | ||
131 | /* | 131 | /* |
132 | * To be called after all the new block groups attached to the transaction | ||
133 | * handle have been created (btrfs_create_pending_block_groups()). | ||
134 | */ | ||
135 | void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) | ||
136 | { | ||
137 | struct btrfs_fs_info *fs_info = trans->fs_info; | ||
138 | |||
139 | if (!trans->chunk_bytes_reserved) | ||
140 | return; | ||
141 | |||
142 | WARN_ON_ONCE(!list_empty(&trans->new_bgs)); | ||
143 | |||
144 | btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, | ||
145 | trans->chunk_bytes_reserved); | ||
146 | trans->chunk_bytes_reserved = 0; | ||
147 | } | ||
148 | |||
149 | /* | ||
132 | * either allocate a new transaction or hop into the existing one | 150 | * either allocate a new transaction or hop into the existing one |
133 | */ | 151 | */ |
134 | static noinline int join_transaction(struct btrfs_fs_info *fs_info, | 152 | static noinline int join_transaction(struct btrfs_fs_info *fs_info, |
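Based only on the comment above, a hedged sketch of how a caller would pair the new helper with block group creation; the actual call sites are not part of this hunk, so the ordering shown is an assumption:

	/* hypothetical caller, for illustration only */
	btrfs_create_pending_block_groups(trans);
	btrfs_trans_release_chunk_metadata(trans);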
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 78c446c222b7..527ea94b57d9 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h | |||
@@ -224,5 +224,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); | |||
224 | void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); | 224 | void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); |
225 | void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, | 225 | void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, |
226 | struct btrfs_root *root); | 226 | struct btrfs_root *root); |
227 | void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); | ||
227 | 228 | ||
228 | #endif | 229 | #endif |
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 96fce4bef4e7..ccd5706199d7 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c | |||
@@ -132,6 +132,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, | |||
132 | struct btrfs_file_extent_item *fi; | 132 | struct btrfs_file_extent_item *fi; |
133 | u32 sectorsize = fs_info->sectorsize; | 133 | u32 sectorsize = fs_info->sectorsize; |
134 | u32 item_size = btrfs_item_size_nr(leaf, slot); | 134 | u32 item_size = btrfs_item_size_nr(leaf, slot); |
135 | u64 extent_end; | ||
135 | 136 | ||
136 | if (!IS_ALIGNED(key->offset, sectorsize)) { | 137 | if (!IS_ALIGNED(key->offset, sectorsize)) { |
137 | file_extent_err(leaf, slot, | 138 | file_extent_err(leaf, slot, |
@@ -207,6 +208,16 @@ static int check_extent_data_item(struct extent_buffer *leaf, | |||
207 | CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)) | 208 | CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)) |
208 | return -EUCLEAN; | 209 | return -EUCLEAN; |
209 | 210 | ||
211 | /* Catch extent end overflow */ | ||
212 | if (check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi), | ||
213 | key->offset, &extent_end)) { | ||
214 | file_extent_err(leaf, slot, | ||
215 | "extent end overflow, have file offset %llu extent num bytes %llu", | ||
216 | key->offset, | ||
217 | btrfs_file_extent_num_bytes(leaf, fi)); | ||
218 | return -EUCLEAN; | ||
219 | } | ||
220 | |||
210 | /* | 221 | /* |
211 | * Check that no two consecutive file extent items, in the same leaf, | 222 | * Check that no two consecutive file extent items, in the same leaf, |
212 | * present ranges that overlap each other. | 223 | * present ranges that overlap each other. |
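The new tree-checker test rejects file extent items whose end offset would wrap past u64. A small illustration of the check_add_overflow() idiom it relies on (from linux/overflow.h; the values are made up for the example):

	u64 extent_end;

	/* an offset close to U64_MAX plus a 1MiB extent wraps, so the item is rejected */
	if (check_add_overflow((u64)-4096, (u64)SZ_1M, &extent_end))
		return -EUCLEAN;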
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3fc8d854d7fb..6c8297bcfeb7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
@@ -3323,6 +3323,30 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, | |||
3323 | } | 3323 | } |
3324 | 3324 | ||
3325 | /* | 3325 | /* |
3326 | * Check if an inode was logged in the current transaction. We can't always rely | ||
3327 | * on an inode's logged_trans value, because it's an in-memory only field and | ||
3328 | * therefore not persisted. This means that its value is lost if the inode gets | ||
3329 | * evicted and loaded again from disk (in which case it has a value of 0, and | ||
3330 | * certainly it is smaller then any possible transaction ID), when that happens | ||
3331 | * the full_sync flag is set in the inode's runtime flags, so on that case we | ||
3332 | * assume eviction happened and ignore the logged_trans value, assuming the | ||
3333 | * worst case, that the inode was logged before in the current transaction. | ||
3334 | */ | ||
3335 | static bool inode_logged(struct btrfs_trans_handle *trans, | ||
3336 | struct btrfs_inode *inode) | ||
3337 | { | ||
3338 | if (inode->logged_trans == trans->transid) | ||
3339 | return true; | ||
3340 | |||
3341 | if (inode->last_trans == trans->transid && | ||
3342 | test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && | ||
3343 | !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) | ||
3344 | return true; | ||
3345 | |||
3346 | return false; | ||
3347 | } | ||
3348 | |||
3349 | /* | ||
3326 | * If both a file and directory are logged, and unlinks or renames are | 3350 | * If both a file and directory are logged, and unlinks or renames are |
3327 | * mixed in, we have a few interesting corners: | 3351 | * mixed in, we have a few interesting corners: |
3328 | * | 3352 | * |
@@ -3356,7 +3380,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, | |||
3356 | int bytes_del = 0; | 3380 | int bytes_del = 0; |
3357 | u64 dir_ino = btrfs_ino(dir); | 3381 | u64 dir_ino = btrfs_ino(dir); |
3358 | 3382 | ||
3359 | if (dir->logged_trans < trans->transid) | 3383 | if (!inode_logged(trans, dir)) |
3360 | return 0; | 3384 | return 0; |
3361 | 3385 | ||
3362 | ret = join_running_log_trans(root); | 3386 | ret = join_running_log_trans(root); |
@@ -3460,7 +3484,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, | |||
3460 | u64 index; | 3484 | u64 index; |
3461 | int ret; | 3485 | int ret; |
3462 | 3486 | ||
3463 | if (inode->logged_trans < trans->transid) | 3487 | if (!inode_logged(trans, inode)) |
3464 | return 0; | 3488 | return 0; |
3465 | 3489 | ||
3466 | ret = join_running_log_trans(root); | 3490 | ret = join_running_log_trans(root); |
@@ -5420,9 +5444,19 @@ log_extents: | |||
5420 | } | 5444 | } |
5421 | } | 5445 | } |
5422 | 5446 | ||
5447 | /* | ||
5448 | * Don't update last_log_commit if we logged that an inode exists after | ||
5449 | * it was loaded to memory (full_sync bit set). | ||
5450 | * This is to prevent data loss when we do a write to the inode, then | ||
5451 | * the inode gets evicted after all delalloc was flushed, then we log | ||
5452 | * it exists (due to a rename for example) and then fsync it. This last | ||
5453 | * fsync would do nothing (not logging the extents previously written). | ||
5454 | */ | ||
5423 | spin_lock(&inode->lock); | 5455 | spin_lock(&inode->lock); |
5424 | inode->logged_trans = trans->transid; | 5456 | inode->logged_trans = trans->transid; |
5425 | inode->last_log_commit = inode->last_sub_trans; | 5457 | if (inode_only != LOG_INODE_EXISTS || |
5458 | !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) | ||
5459 | inode->last_log_commit = inode->last_sub_trans; | ||
5426 | spin_unlock(&inode->lock); | 5460 | spin_unlock(&inode->lock); |
5427 | out_unlock: | 5461 | out_unlock: |
5428 | mutex_unlock(&inode->log_mutex); | 5462 | mutex_unlock(&inode->log_mutex); |
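The scenario the last_log_commit change guards against can be written as a userspace sequence (illustrative only, paraphrasing the comment above):

	write(fd, buf, len);          /* creates delalloc                              */
	/* inode is evicted and reloaded; BTRFS_INODE_NEEDS_FULL_SYNC gets set         */
	rename(oldpath, newpath);     /* logs only that the inode exists               */
	fsync(fd);                    /* must still log the previously written extents */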
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1c2a6e4b39da..a13ddba1ebc3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include "dev-replace.h" | 28 | #include "dev-replace.h" |
29 | #include "sysfs.h" | 29 | #include "sysfs.h" |
30 | #include "tree-checker.h" | 30 | #include "tree-checker.h" |
31 | #include "space-info.h" | ||
31 | 32 | ||
32 | const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { | 33 | const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { |
33 | [BTRFS_RAID_RAID10] = { | 34 | [BTRFS_RAID_RAID10] = { |
@@ -123,12 +124,14 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { | |||
123 | }, | 124 | }, |
124 | }; | 125 | }; |
125 | 126 | ||
126 | const char *get_raid_name(enum btrfs_raid_types type) | 127 | const char *btrfs_bg_type_to_raid_name(u64 flags) |
127 | { | 128 | { |
128 | if (type >= BTRFS_NR_RAID_TYPES) | 129 | const int index = btrfs_bg_flags_to_raid_index(flags); |
130 | |||
131 | if (index >= BTRFS_NR_RAID_TYPES) | ||
129 | return NULL; | 132 | return NULL; |
130 | 133 | ||
131 | return btrfs_raid_array[type].raid_name; | 134 | return btrfs_raid_array[index].raid_name; |
132 | } | 135 | } |
133 | 136 | ||
134 | /* | 137 | /* |
@@ -237,7 +240,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, | |||
237 | * chunk_mutex | 240 | * chunk_mutex |
238 | * ----------- | 241 | * ----------- |
239 | * protects chunks, adding or removing during allocation, trim or when a new | 242 | * protects chunks, adding or removing during allocation, trim or when a new |
240 | * device is added/removed | 243 | * device is added/removed. It also protects the post_commit_list of |
244 | * individual devices, since they can be added to the transaction's | ||
245 | * post_commit_list only with chunk_mutex held. | ||
241 | * | 246 | * |
242 | * cleaner_mutex | 247 | * cleaner_mutex |
243 | * ------------- | 248 | * ------------- |
@@ -1818,7 +1823,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info) | |||
1818 | struct rb_node *n; | 1823 | struct rb_node *n; |
1819 | u64 ret = 0; | 1824 | u64 ret = 0; |
1820 | 1825 | ||
1821 | em_tree = &fs_info->mapping_tree.map_tree; | 1826 | em_tree = &fs_info->mapping_tree; |
1822 | read_lock(&em_tree->lock); | 1827 | read_lock(&em_tree->lock); |
1823 | n = rb_last(&em_tree->map.rb_root); | 1828 | n = rb_last(&em_tree->map.rb_root); |
1824 | if (n) { | 1829 | if (n) { |
@@ -2941,7 +2946,7 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, | |||
2941 | struct extent_map_tree *em_tree; | 2946 | struct extent_map_tree *em_tree; |
2942 | struct extent_map *em; | 2947 | struct extent_map *em; |
2943 | 2948 | ||
2944 | em_tree = &fs_info->mapping_tree.map_tree; | 2949 | em_tree = &fs_info->mapping_tree; |
2945 | read_lock(&em_tree->lock); | 2950 | read_lock(&em_tree->lock); |
2946 | em = lookup_extent_mapping(em_tree, logical, length); | 2951 | em = lookup_extent_mapping(em_tree, logical, length); |
2947 | read_unlock(&em_tree->lock); | 2952 | read_unlock(&em_tree->lock); |
@@ -3474,6 +3479,18 @@ static int chunk_devid_filter(struct extent_buffer *leaf, | |||
3474 | return 1; | 3479 | return 1; |
3475 | } | 3480 | } |
3476 | 3481 | ||
3482 | static u64 calc_data_stripes(u64 type, int num_stripes) | ||
3483 | { | ||
3484 | const int index = btrfs_bg_flags_to_raid_index(type); | ||
3485 | const int ncopies = btrfs_raid_array[index].ncopies; | ||
3486 | const int nparity = btrfs_raid_array[index].nparity; | ||
3487 | |||
3488 | if (nparity) | ||
3489 | return num_stripes - nparity; | ||
3490 | else | ||
3491 | return num_stripes / ncopies; | ||
3492 | } | ||
3493 | |||
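	/*
	 * Worked examples (not part of the patch): a RAID6 chunk with
	 * num_stripes = 6 has nparity = 2, so calc_data_stripes() returns 4;
	 * a RAID10 chunk with num_stripes = 4 has nparity = 0 and ncopies = 2,
	 * so it returns 4 / 2 = 2.
	 */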
3477 | /* [pstart, pend) */ | 3494 | /* [pstart, pend) */ |
3478 | static int chunk_drange_filter(struct extent_buffer *leaf, | 3495 | static int chunk_drange_filter(struct extent_buffer *leaf, |
3479 | struct btrfs_chunk *chunk, | 3496 | struct btrfs_chunk *chunk, |
@@ -3483,22 +3500,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf, | |||
3483 | int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); | 3500 | int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); |
3484 | u64 stripe_offset; | 3501 | u64 stripe_offset; |
3485 | u64 stripe_length; | 3502 | u64 stripe_length; |
3503 | u64 type; | ||
3486 | int factor; | 3504 | int factor; |
3487 | int i; | 3505 | int i; |
3488 | 3506 | ||
3489 | if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) | 3507 | if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) |
3490 | return 0; | 3508 | return 0; |
3491 | 3509 | ||
3492 | if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | | 3510 | type = btrfs_chunk_type(leaf, chunk); |
3493 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { | 3511 | factor = calc_data_stripes(type, num_stripes); |
3494 | factor = num_stripes / 2; | ||
3495 | } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { | ||
3496 | factor = num_stripes - 1; | ||
3497 | } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { | ||
3498 | factor = num_stripes - 2; | ||
3499 | } else { | ||
3500 | factor = num_stripes; | ||
3501 | } | ||
3502 | 3512 | ||
3503 | for (i = 0; i < num_stripes; i++) { | 3513 | for (i = 0; i < num_stripes; i++) { |
3504 | stripe = btrfs_stripe_nr(chunk, i); | 3514 | stripe = btrfs_stripe_nr(chunk, i); |
@@ -3921,11 +3931,9 @@ static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, | |||
3921 | bp += ret; \ | 3931 | bp += ret; \ |
3922 | } while (0) | 3932 | } while (0) |
3923 | 3933 | ||
3924 | if (flags & BTRFS_BALANCE_ARGS_CONVERT) { | 3934 | if (flags & BTRFS_BALANCE_ARGS_CONVERT) |
3925 | int index = btrfs_bg_flags_to_raid_index(bargs->target); | 3935 | CHECK_APPEND_1ARG("convert=%s,", |
3926 | 3936 | btrfs_bg_type_to_raid_name(bargs->target)); | |
3927 | CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index)); | ||
3928 | } | ||
3929 | 3937 | ||
3930 | if (flags & BTRFS_BALANCE_ARGS_SOFT) | 3938 | if (flags & BTRFS_BALANCE_ARGS_SOFT) |
3931 | CHECK_APPEND_NOARG("soft,"); | 3939 | CHECK_APPEND_NOARG("soft,"); |
@@ -4047,6 +4055,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, | |||
4047 | u64 num_devices; | 4055 | u64 num_devices; |
4048 | unsigned seq; | 4056 | unsigned seq; |
4049 | bool reducing_integrity; | 4057 | bool reducing_integrity; |
4058 | int i; | ||
4050 | 4059 | ||
4051 | if (btrfs_fs_closing(fs_info) || | 4060 | if (btrfs_fs_closing(fs_info) || |
4052 | atomic_read(&fs_info->balance_pause_req) || | 4061 | atomic_read(&fs_info->balance_pause_req) || |
@@ -4076,48 +4085,43 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, | |||
4076 | } | 4085 | } |
4077 | 4086 | ||
4078 | num_devices = btrfs_num_devices(fs_info); | 4087 | num_devices = btrfs_num_devices(fs_info); |
4088 | allowed = 0; | ||
4089 | for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) | ||
4090 | if (num_devices >= btrfs_raid_array[i].devs_min) | ||
4091 | allowed |= btrfs_raid_array[i].bg_flag; | ||
4079 | 4092 | ||
4080 | allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; | ||
4081 | if (num_devices > 1) | ||
4082 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); | ||
4083 | if (num_devices > 2) | ||
4084 | allowed |= BTRFS_BLOCK_GROUP_RAID5; | ||
4085 | if (num_devices > 3) | ||
4086 | allowed |= (BTRFS_BLOCK_GROUP_RAID10 | | ||
4087 | BTRFS_BLOCK_GROUP_RAID6); | ||
4088 | if (validate_convert_profile(&bctl->data, allowed)) { | 4093 | if (validate_convert_profile(&bctl->data, allowed)) { |
4089 | int index = btrfs_bg_flags_to_raid_index(bctl->data.target); | ||
4090 | |||
4091 | btrfs_err(fs_info, | 4094 | btrfs_err(fs_info, |
4092 | "balance: invalid convert data profile %s", | 4095 | "balance: invalid convert data profile %s", |
4093 | get_raid_name(index)); | 4096 | btrfs_bg_type_to_raid_name(bctl->data.target)); |
4094 | ret = -EINVAL; | 4097 | ret = -EINVAL; |
4095 | goto out; | 4098 | goto out; |
4096 | } | 4099 | } |
4097 | if (validate_convert_profile(&bctl->meta, allowed)) { | 4100 | if (validate_convert_profile(&bctl->meta, allowed)) { |
4098 | int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); | ||
4099 | |||
4100 | btrfs_err(fs_info, | 4101 | btrfs_err(fs_info, |
4101 | "balance: invalid convert metadata profile %s", | 4102 | "balance: invalid convert metadata profile %s", |
4102 | get_raid_name(index)); | 4103 | btrfs_bg_type_to_raid_name(bctl->meta.target)); |
4103 | ret = -EINVAL; | 4104 | ret = -EINVAL; |
4104 | goto out; | 4105 | goto out; |
4105 | } | 4106 | } |
4106 | if (validate_convert_profile(&bctl->sys, allowed)) { | 4107 | if (validate_convert_profile(&bctl->sys, allowed)) { |
4107 | int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); | ||
4108 | |||
4109 | btrfs_err(fs_info, | 4108 | btrfs_err(fs_info, |
4110 | "balance: invalid convert system profile %s", | 4109 | "balance: invalid convert system profile %s", |
4111 | get_raid_name(index)); | 4110 | btrfs_bg_type_to_raid_name(bctl->sys.target)); |
4112 | ret = -EINVAL; | 4111 | ret = -EINVAL; |
4113 | goto out; | 4112 | goto out; |
4114 | } | 4113 | } |
4115 | 4114 | ||
4116 | /* allow to reduce meta or sys integrity only if force set */ | 4115 | /* |
4117 | allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | | 4116 | * Allow to reduce metadata or system integrity only if force set for |
4118 | BTRFS_BLOCK_GROUP_RAID10 | | 4117 | * profiles with redundancy (copies, parity) |
4119 | BTRFS_BLOCK_GROUP_RAID5 | | 4118 | */ |
4120 | BTRFS_BLOCK_GROUP_RAID6; | 4119 | allowed = 0; |
4120 | for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { | ||
4121 | if (btrfs_raid_array[i].ncopies >= 2 || | ||
4122 | btrfs_raid_array[i].tolerated_failures >= 1) | ||
4123 | allowed |= btrfs_raid_array[i].bg_flag; | ||
4124 | } | ||
4121 | do { | 4125 | do { |
4122 | seq = read_seqbegin(&fs_info->profiles_lock); | 4126 | seq = read_seqbegin(&fs_info->profiles_lock); |
4123 | 4127 | ||
@@ -4152,12 +4156,18 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, | |||
4152 | 4156 | ||
4153 | if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < | 4157 | if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < |
4154 | btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { | 4158 | btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { |
4155 | int meta_index = btrfs_bg_flags_to_raid_index(meta_target); | ||
4156 | int data_index = btrfs_bg_flags_to_raid_index(data_target); | ||
4157 | |||
4158 | btrfs_warn(fs_info, | 4159 | btrfs_warn(fs_info, |
4159 | "balance: metadata profile %s has lower redundancy than data profile %s", | 4160 | "balance: metadata profile %s has lower redundancy than data profile %s", |
4160 | get_raid_name(meta_index), get_raid_name(data_index)); | 4161 | btrfs_bg_type_to_raid_name(meta_target), |
4162 | btrfs_bg_type_to_raid_name(data_target)); | ||
4163 | } | ||
4164 | |||
4165 | if (fs_info->send_in_progress) { | ||
4166 | btrfs_warn_rl(fs_info, | ||
4167 | "cannot run balance while send operations are in progress (%d in progress)", | ||
4168 | fs_info->send_in_progress); | ||
4169 | ret = -EAGAIN; | ||
4170 | goto out; | ||
4161 | } | 4171 | } |
4162 | 4172 | ||
4163 | ret = insert_balance_item(fs_info, bctl); | 4173 | ret = insert_balance_item(fs_info, bctl); |
@@ -4949,6 +4959,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
4949 | sub_stripes = btrfs_raid_array[index].sub_stripes; | 4959 | sub_stripes = btrfs_raid_array[index].sub_stripes; |
4950 | dev_stripes = btrfs_raid_array[index].dev_stripes; | 4960 | dev_stripes = btrfs_raid_array[index].dev_stripes; |
4951 | devs_max = btrfs_raid_array[index].devs_max; | 4961 | devs_max = btrfs_raid_array[index].devs_max; |
4962 | if (!devs_max) | ||
4963 | devs_max = BTRFS_MAX_DEVS(info); | ||
4952 | devs_min = btrfs_raid_array[index].devs_min; | 4964 | devs_min = btrfs_raid_array[index].devs_min; |
4953 | devs_increment = btrfs_raid_array[index].devs_increment; | 4965 | devs_increment = btrfs_raid_array[index].devs_increment; |
4954 | ncopies = btrfs_raid_array[index].ncopies; | 4966 | ncopies = btrfs_raid_array[index].ncopies; |
@@ -4957,8 +4969,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
4957 | if (type & BTRFS_BLOCK_GROUP_DATA) { | 4969 | if (type & BTRFS_BLOCK_GROUP_DATA) { |
4958 | max_stripe_size = SZ_1G; | 4970 | max_stripe_size = SZ_1G; |
4959 | max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; | 4971 | max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; |
4960 | if (!devs_max) | ||
4961 | devs_max = BTRFS_MAX_DEVS(info); | ||
4962 | } else if (type & BTRFS_BLOCK_GROUP_METADATA) { | 4972 | } else if (type & BTRFS_BLOCK_GROUP_METADATA) { |
4963 | /* for larger filesystems, use larger metadata chunks */ | 4973 | /* for larger filesystems, use larger metadata chunks */ |
4964 | if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) | 4974 | if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) |
@@ -4966,13 +4976,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
4966 | else | 4976 | else |
4967 | max_stripe_size = SZ_256M; | 4977 | max_stripe_size = SZ_256M; |
4968 | max_chunk_size = max_stripe_size; | 4978 | max_chunk_size = max_stripe_size; |
4969 | if (!devs_max) | ||
4970 | devs_max = BTRFS_MAX_DEVS(info); | ||
4971 | } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { | 4979 | } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { |
4972 | max_stripe_size = SZ_32M; | 4980 | max_stripe_size = SZ_32M; |
4973 | max_chunk_size = 2 * max_stripe_size; | 4981 | max_chunk_size = 2 * max_stripe_size; |
4974 | if (!devs_max) | ||
4975 | devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; | ||
4976 | } else { | 4982 | } else { |
4977 | btrfs_err(info, "invalid chunk type 0x%llx requested", | 4983 | btrfs_err(info, "invalid chunk type 0x%llx requested", |
4978 | type); | 4984 | type); |
@@ -5143,7 +5149,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
5143 | em->block_len = em->len; | 5149 | em->block_len = em->len; |
5144 | em->orig_block_len = stripe_size; | 5150 | em->orig_block_len = stripe_size; |
5145 | 5151 | ||
5146 | em_tree = &info->mapping_tree.map_tree; | 5152 | em_tree = &info->mapping_tree; |
5147 | write_lock(&em_tree->lock); | 5153 | write_lock(&em_tree->lock); |
5148 | ret = add_extent_mapping(em_tree, em, 0); | 5154 | ret = add_extent_mapping(em_tree, em, 0); |
5149 | if (ret) { | 5155 | if (ret) { |
@@ -5324,20 +5330,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) | |||
5324 | 5330 | ||
5325 | static inline int btrfs_chunk_max_errors(struct map_lookup *map) | 5331 | static inline int btrfs_chunk_max_errors(struct map_lookup *map) |
5326 | { | 5332 | { |
5327 | int max_errors; | 5333 | const int index = btrfs_bg_flags_to_raid_index(map->type); |
5328 | 5334 | ||
5329 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | | 5335 | return btrfs_raid_array[index].tolerated_failures; |
5330 | BTRFS_BLOCK_GROUP_RAID10 | | ||
5331 | BTRFS_BLOCK_GROUP_RAID5 | | ||
5332 | BTRFS_BLOCK_GROUP_DUP)) { | ||
5333 | max_errors = 1; | ||
5334 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { | ||
5335 | max_errors = 2; | ||
5336 | } else { | ||
5337 | max_errors = 0; | ||
5338 | } | ||
5339 | |||
5340 | return max_errors; | ||
5341 | } | 5336 | } |
5342 | 5337 | ||
5343 | int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) | 5338 | int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) |
@@ -5378,21 +5373,16 @@ end: | |||
5378 | return readonly; | 5373 | return readonly; |
5379 | } | 5374 | } |
5380 | 5375 | ||
5381 | void btrfs_mapping_init(struct btrfs_mapping_tree *tree) | 5376 | void btrfs_mapping_tree_free(struct extent_map_tree *tree) |
5382 | { | ||
5383 | extent_map_tree_init(&tree->map_tree); | ||
5384 | } | ||
5385 | |||
5386 | void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) | ||
5387 | { | 5377 | { |
5388 | struct extent_map *em; | 5378 | struct extent_map *em; |
5389 | 5379 | ||
5390 | while (1) { | 5380 | while (1) { |
5391 | write_lock(&tree->map_tree.lock); | 5381 | write_lock(&tree->lock); |
5392 | em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); | 5382 | em = lookup_extent_mapping(tree, 0, (u64)-1); |
5393 | if (em) | 5383 | if (em) |
5394 | remove_extent_mapping(&tree->map_tree, em); | 5384 | remove_extent_mapping(tree, em); |
5395 | write_unlock(&tree->map_tree.lock); | 5385 | write_unlock(&tree->lock); |
5396 | if (!em) | 5386 | if (!em) |
5397 | break; | 5387 | break; |
5398 | /* once for us */ | 5388 | /* once for us */ |
@@ -5419,7 +5409,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) | |||
5419 | return 1; | 5409 | return 1; |
5420 | 5410 | ||
5421 | map = em->map_lookup; | 5411 | map = em->map_lookup; |
5422 | if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) | 5412 | if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) |
5423 | ret = map->num_stripes; | 5413 | ret = map->num_stripes; |
5424 | else if (map->type & BTRFS_BLOCK_GROUP_RAID10) | 5414 | else if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
5425 | ret = map->sub_stripes; | 5415 | ret = map->sub_stripes; |
@@ -5493,7 +5483,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, | |||
5493 | struct btrfs_device *srcdev; | 5483 | struct btrfs_device *srcdev; |
5494 | 5484 | ||
5495 | ASSERT((map->type & | 5485 | ASSERT((map->type & |
5496 | (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); | 5486 | (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); |
5497 | 5487 | ||
5498 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) | 5488 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
5499 | num_stripes = map->sub_stripes; | 5489 | num_stripes = map->sub_stripes; |
@@ -5682,7 +5672,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, | |||
5682 | &remaining_stripes); | 5672 | &remaining_stripes); |
5683 | div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); | 5673 | div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); |
5684 | last_stripe *= sub_stripes; | 5674 | last_stripe *= sub_stripes; |
5685 | } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | | 5675 | } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | |
5686 | BTRFS_BLOCK_GROUP_DUP)) { | 5676 | BTRFS_BLOCK_GROUP_DUP)) { |
5687 | num_stripes = map->num_stripes; | 5677 | num_stripes = map->num_stripes; |
5688 | } else { | 5678 | } else { |
@@ -5926,6 +5916,102 @@ static bool need_full_stripe(enum btrfs_map_op op) | |||
5926 | return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); | 5916 | return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); |
5927 | } | 5917 | } |
5928 | 5918 | ||
5919 | /* | ||
5920 | * btrfs_get_io_geometry - calculates the geometry of a particular (address, len) | ||
5921 | * tuple. This information is used to calculate how big a | ||
5922 | * particular bio can get before it straddles a stripe. | ||
5923 | * | ||
5924 | * @fs_info - the filesystem | ||
5925 | * @logical - address that we want to figure out the geometry of | ||
5926 | * @len - the length of IO we are going to perform, starting at @logical | ||
5927 | * @op - type of operation - write or read | ||
5928 | * @io_geom - pointer used to return values | ||
5929 | * | ||
5930 | * Returns < 0 in case a chunk for the given logical address cannot be found, | ||
5931 | * usually shouldn't happen unless @logical is corrupted, 0 otherwise. | ||
5932 | */ | ||
5933 | int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, | ||
5934 | u64 logical, u64 len, struct btrfs_io_geometry *io_geom) | ||
5935 | { | ||
5936 | struct extent_map *em; | ||
5937 | struct map_lookup *map; | ||
5938 | u64 offset; | ||
5939 | u64 stripe_offset; | ||
5940 | u64 stripe_nr; | ||
5941 | u64 stripe_len; | ||
5942 | u64 raid56_full_stripe_start = (u64)-1; | ||
5943 | int data_stripes; | ||
5944 | |||
5945 | ASSERT(op != BTRFS_MAP_DISCARD); | ||
5946 | |||
5947 | em = btrfs_get_chunk_map(fs_info, logical, len); | ||
5948 | if (IS_ERR(em)) | ||
5949 | return PTR_ERR(em); | ||
5950 | |||
5951 | map = em->map_lookup; | ||
5952 | /* Offset of this logical address in the chunk */ | ||
5953 | offset = logical - em->start; | ||
5954 | /* Len of a stripe in a chunk */ | ||
5955 | stripe_len = map->stripe_len; | ||
5956 | * Stripe this block falls into | ||
5957 | stripe_nr = div64_u64(offset, stripe_len); | ||
5958 | /* Offset of stripe in the chunk */ | ||
5959 | stripe_offset = stripe_nr * stripe_len; | ||
5960 | if (offset < stripe_offset) { | ||
5961 | btrfs_crit(fs_info, | ||
5962 | "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", | ||
5963 | stripe_offset, offset, em->start, logical, stripe_len); | ||
5964 | free_extent_map(em); | ||
5965 | return -EINVAL; | ||
5966 | } | ||
5967 | |||
5968 | /* stripe_offset is the offset of this block in its stripe */ | ||
5969 | stripe_offset = offset - stripe_offset; | ||
5970 | data_stripes = nr_data_stripes(map); | ||
5971 | |||
5972 | if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { | ||
5973 | u64 max_len = stripe_len - stripe_offset; | ||
5974 | |||
5975 | /* | ||
5976 | * In case of raid56, we need to know the stripe aligned start | ||
5977 | */ | ||
5978 | if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { | ||
5979 | unsigned long full_stripe_len = stripe_len * data_stripes; | ||
5980 | raid56_full_stripe_start = offset; | ||
5981 | |||
5982 | /* | ||
5983 | * Allow a write of a full stripe, but make sure we | ||
5984 | * don't allow straddling of stripes | ||
5985 | */ | ||
5986 | raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, | ||
5987 | full_stripe_len); | ||
5988 | raid56_full_stripe_start *= full_stripe_len; | ||
5989 | |||
5990 | /* | ||
5991 | * For writes to RAID[56], allow a full stripeset across | ||
5992 | * all disks. For other RAID types and for RAID[56] | ||
5993 | * reads, just allow a single stripe (on a single disk). | ||
5994 | */ | ||
5995 | if (op == BTRFS_MAP_WRITE) { | ||
5996 | max_len = stripe_len * data_stripes - | ||
5997 | (offset - raid56_full_stripe_start); | ||
5998 | } | ||
5999 | } | ||
6000 | len = min_t(u64, em->len - offset, max_len); | ||
6001 | } else { | ||
6002 | len = em->len - offset; | ||
6003 | } | ||
6004 | |||
6005 | io_geom->len = len; | ||
6006 | io_geom->offset = offset; | ||
6007 | io_geom->stripe_len = stripe_len; | ||
6008 | io_geom->stripe_nr = stripe_nr; | ||
6009 | io_geom->stripe_offset = stripe_offset; | ||
6010 | io_geom->raid56_stripe_offset = raid56_full_stripe_start; | ||
6011 | |||
6012 | return 0; | ||
6013 | } | ||
6014 | |||
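	/*
	 * Example (illustrative, not part of the patch): with stripe_len = 64K
	 * and an offset of 200K into the chunk, stripe_nr = 200K / 64K = 3 and
	 * stripe_offset = 200K - 3 * 64K = 8K, so a non-RAID56 bio starting
	 * here may cover at most stripe_len - stripe_offset = 56K before it
	 * would straddle a stripe boundary.
	 */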
5929 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, | 6015 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, |
5930 | enum btrfs_map_op op, | 6016 | enum btrfs_map_op op, |
5931 | u64 logical, u64 *length, | 6017 | u64 logical, u64 *length, |
@@ -5939,6 +6025,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, | |||
5939 | u64 stripe_nr; | 6025 | u64 stripe_nr; |
5940 | u64 stripe_len; | 6026 | u64 stripe_len; |
5941 | u32 stripe_index; | 6027 | u32 stripe_index; |
6028 | int data_stripes; | ||
5942 | int i; | 6029 | int i; |
5943 | int ret = 0; | 6030 | int ret = 0; |
5944 | int num_stripes; | 6031 | int num_stripes; |
@@ -5951,76 +6038,29 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, | |||
5951 | int patch_the_first_stripe_for_dev_replace = 0; | 6038 | int patch_the_first_stripe_for_dev_replace = 0; |
5952 | u64 physical_to_patch_in_first_stripe = 0; | 6039 | u64 physical_to_patch_in_first_stripe = 0; |
5953 | u64 raid56_full_stripe_start = (u64)-1; | 6040 | u64 raid56_full_stripe_start = (u64)-1; |
6041 | struct btrfs_io_geometry geom; | ||
6042 | |||
6043 | ASSERT(bbio_ret); | ||
5954 | 6044 | ||
5955 | if (op == BTRFS_MAP_DISCARD) | 6045 | if (op == BTRFS_MAP_DISCARD) |
5956 | return __btrfs_map_block_for_discard(fs_info, logical, | 6046 | return __btrfs_map_block_for_discard(fs_info, logical, |
5957 | *length, bbio_ret); | 6047 | *length, bbio_ret); |
5958 | 6048 | ||
5959 | em = btrfs_get_chunk_map(fs_info, logical, *length); | 6049 | ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); |
5960 | if (IS_ERR(em)) | 6050 | if (ret < 0) |
5961 | return PTR_ERR(em); | 6051 | return ret; |
5962 | 6052 | ||
6053 | em = btrfs_get_chunk_map(fs_info, logical, *length); | ||
6054 | ASSERT(em); | ||
5963 | map = em->map_lookup; | 6055 | map = em->map_lookup; |
5964 | offset = logical - em->start; | ||
5965 | |||
5966 | stripe_len = map->stripe_len; | ||
5967 | stripe_nr = offset; | ||
5968 | /* | ||
5969 | * stripe_nr counts the total number of stripes we have to stride | ||
5970 | * to get to this block | ||
5971 | */ | ||
5972 | stripe_nr = div64_u64(stripe_nr, stripe_len); | ||
5973 | |||
5974 | stripe_offset = stripe_nr * stripe_len; | ||
5975 | if (offset < stripe_offset) { | ||
5976 | btrfs_crit(fs_info, | ||
5977 | "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", | ||
5978 | stripe_offset, offset, em->start, logical, | ||
5979 | stripe_len); | ||
5980 | free_extent_map(em); | ||
5981 | return -EINVAL; | ||
5982 | } | ||
5983 | |||
5984 | /* stripe_offset is the offset of this block in its stripe*/ | ||
5985 | stripe_offset = offset - stripe_offset; | ||
5986 | |||
5987 | /* if we're here for raid56, we need to know the stripe aligned start */ | ||
5988 | if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { | ||
5989 | unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); | ||
5990 | raid56_full_stripe_start = offset; | ||
5991 | 6056 | ||
5992 | /* allow a write of a full stripe, but make sure we don't | 6057 | *length = geom.len; |
5993 | * allow straddling of stripes | 6058 | offset = geom.offset; |
5994 | */ | 6059 | stripe_len = geom.stripe_len; |
5995 | raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, | 6060 | stripe_nr = geom.stripe_nr; |
5996 | full_stripe_len); | 6061 | stripe_offset = geom.stripe_offset; |
5997 | raid56_full_stripe_start *= full_stripe_len; | 6062 | raid56_full_stripe_start = geom.raid56_stripe_offset; |
5998 | } | 6063 | data_stripes = nr_data_stripes(map); |
5999 | |||
6000 | if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { | ||
6001 | u64 max_len; | ||
6002 | /* For writes to RAID[56], allow a full stripeset across all disks. | ||
6003 | For other RAID types and for RAID[56] reads, just allow a single | ||
6004 | stripe (on a single disk). */ | ||
6005 | if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && | ||
6006 | (op == BTRFS_MAP_WRITE)) { | ||
6007 | max_len = stripe_len * nr_data_stripes(map) - | ||
6008 | (offset - raid56_full_stripe_start); | ||
6009 | } else { | ||
6010 | /* we limit the length of each bio to what fits in a stripe */ | ||
6011 | max_len = stripe_len - stripe_offset; | ||
6012 | } | ||
6013 | *length = min_t(u64, em->len - offset, max_len); | ||
6014 | } else { | ||
6015 | *length = em->len - offset; | ||
6016 | } | ||
6017 | |||
6018 | /* | ||
6019 | * This is for when we're called from btrfs_bio_fits_in_stripe and all | ||
6020 | * it cares about is the length | ||
6021 | */ | ||
6022 | if (!bbio_ret) | ||
6023 | goto out; | ||
6024 | 6064 | ||
6025 | down_read(&dev_replace->rwsem); | 6065 | down_read(&dev_replace->rwsem); |
6026 | dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); | 6066 | dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); |
@@ -6052,7 +6092,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, | |||
6052 | &stripe_index); | 6092 | &stripe_index); |
6053 | if (!need_full_stripe(op)) | 6093 | if (!need_full_stripe(op)) |
6054 | mirror_num = 1; | 6094 | mirror_num = 1; |
6055 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { | 6095 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { |
6056 | if (need_full_stripe(op)) | 6096 | if (need_full_stripe(op)) |
6057 | num_stripes = map->num_stripes; | 6097 | num_stripes = map->num_stripes; |
6058 | else if (mirror_num) | 6098 | else if (mirror_num) |
@@ -6094,7 +6134,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, | |||
6094 | if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { | 6134 | if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { |
6095 | /* push stripe_nr back to the start of the full stripe */ | 6135 | /* push stripe_nr back to the start of the full stripe */ |
6096 | stripe_nr = div64_u64(raid56_full_stripe_start, | 6136 | stripe_nr = div64_u64(raid56_full_stripe_start, |
6097 | stripe_len * nr_data_stripes(map)); | 6137 | stripe_len * data_stripes); |
6098 | 6138 | ||
6099 | /* RAID[56] write or recovery. Return all stripes */ | 6139 | /* RAID[56] write or recovery. Return all stripes */ |
6100 | num_stripes = map->num_stripes; | 6140 | num_stripes = map->num_stripes; |
@@ -6110,10 +6150,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, | |||
6110 | * Mirror #3 is RAID6 Q block. | 6150 | * Mirror #3 is RAID6 Q block. |
6111 | */ | 6151 | */ |
6112 | stripe_nr = div_u64_rem(stripe_nr, | 6152 | stripe_nr = div_u64_rem(stripe_nr, |
6113 | nr_data_stripes(map), &stripe_index); | 6153 | data_stripes, &stripe_index); |
6114 | if (mirror_num > 1) | 6154 | if (mirror_num > 1) |
6115 | stripe_index = nr_data_stripes(map) + | 6155 | stripe_index = data_stripes + mirror_num - 2; |
6116 | mirror_num - 2; | ||
6117 | 6156 | ||
6118 | /* We distribute the parity blocks across stripes */ | 6157 | /* We distribute the parity blocks across stripes */ |
6119 | div_u64_rem(stripe_nr + stripe_index, map->num_stripes, | 6158 | div_u64_rem(stripe_nr + stripe_index, map->num_stripes, |
@@ -6171,8 +6210,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, | |||
6171 | div_u64_rem(stripe_nr, num_stripes, &rot); | 6210 | div_u64_rem(stripe_nr, num_stripes, &rot); |
6172 | 6211 | ||
6173 | /* Fill in the logical address of each stripe */ | 6212 | /* Fill in the logical address of each stripe */ |
6174 | tmp = stripe_nr * nr_data_stripes(map); | 6213 | tmp = stripe_nr * data_stripes; |
6175 | for (i = 0; i < nr_data_stripes(map); i++) | 6214 | for (i = 0; i < data_stripes; i++) |
6176 | bbio->raid_map[(i+rot) % num_stripes] = | 6215 | bbio->raid_map[(i+rot) % num_stripes] = |
6177 | em->start + (tmp + i) * map->stripe_len; | 6216 | em->start + (tmp + i) * map->stripe_len; |
6178 | 6217 | ||
@@ -6687,7 +6726,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, | |||
6687 | struct btrfs_chunk *chunk) | 6726 | struct btrfs_chunk *chunk) |
6688 | { | 6727 | { |
6689 | struct btrfs_fs_info *fs_info = leaf->fs_info; | 6728 | struct btrfs_fs_info *fs_info = leaf->fs_info; |
6690 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; | 6729 | struct extent_map_tree *map_tree = &fs_info->mapping_tree; |
6691 | struct map_lookup *map; | 6730 | struct map_lookup *map; |
6692 | struct extent_map *em; | 6731 | struct extent_map *em; |
6693 | u64 logical; | 6732 | u64 logical; |
@@ -6712,9 +6751,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, | |||
6712 | return ret; | 6751 | return ret; |
6713 | } | 6752 | } |
6714 | 6753 | ||
6715 | read_lock(&map_tree->map_tree.lock); | 6754 | read_lock(&map_tree->lock); |
6716 | em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); | 6755 | em = lookup_extent_mapping(map_tree, logical, 1); |
6717 | read_unlock(&map_tree->map_tree.lock); | 6756 | read_unlock(&map_tree->lock); |
6718 | 6757 | ||
6719 | /* already mapped? */ | 6758 | /* already mapped? */ |
6720 | if (em && em->start <= logical && em->start + em->len > logical) { | 6759 | if (em && em->start <= logical && em->start + em->len > logical) { |
@@ -6783,9 +6822,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, | |||
6783 | 6822 | ||
6784 | } | 6823 | } |
6785 | 6824 | ||
6786 | write_lock(&map_tree->map_tree.lock); | 6825 | write_lock(&map_tree->lock); |
6787 | ret = add_extent_mapping(&map_tree->map_tree, em, 0); | 6826 | ret = add_extent_mapping(map_tree, em, 0); |
6788 | write_unlock(&map_tree->map_tree.lock); | 6827 | write_unlock(&map_tree->lock); |
6789 | if (ret < 0) { | 6828 | if (ret < 0) { |
6790 | btrfs_err(fs_info, | 6829 | btrfs_err(fs_info, |
6791 | "failed to add chunk map, start=%llu len=%llu: %d", | 6830 | "failed to add chunk map, start=%llu len=%llu: %d", |
@@ -7103,14 +7142,14 @@ out_short_read: | |||
7103 | bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, | 7142 | bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, |
7104 | struct btrfs_device *failing_dev) | 7143 | struct btrfs_device *failing_dev) |
7105 | { | 7144 | { |
7106 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; | 7145 | struct extent_map_tree *map_tree = &fs_info->mapping_tree; |
7107 | struct extent_map *em; | 7146 | struct extent_map *em; |
7108 | u64 next_start = 0; | 7147 | u64 next_start = 0; |
7109 | bool ret = true; | 7148 | bool ret = true; |
7110 | 7149 | ||
7111 | read_lock(&map_tree->map_tree.lock); | 7150 | read_lock(&map_tree->lock); |
7112 | em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); | 7151 | em = lookup_extent_mapping(map_tree, 0, (u64)-1); |
7113 | read_unlock(&map_tree->map_tree.lock); | 7152 | read_unlock(&map_tree->lock); |
7114 | /* No chunk at all? Return false anyway */ | 7153 | /* No chunk at all? Return false anyway */ |
7115 | if (!em) { | 7154 | if (!em) { |
7116 | ret = false; | 7155 | ret = false; |
@@ -7148,10 +7187,10 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, | |||
7148 | next_start = extent_map_end(em); | 7187 | next_start = extent_map_end(em); |
7149 | free_extent_map(em); | 7188 | free_extent_map(em); |
7150 | 7189 | ||
7151 | read_lock(&map_tree->map_tree.lock); | 7190 | read_lock(&map_tree->lock); |
7152 | em = lookup_extent_mapping(&map_tree->map_tree, next_start, | 7191 | em = lookup_extent_mapping(map_tree, next_start, |
7153 | (u64)(-1) - next_start); | 7192 | (u64)(-1) - next_start); |
7154 | read_unlock(&map_tree->map_tree.lock); | 7193 | read_unlock(&map_tree->lock); |
7155 | } | 7194 | } |
7156 | out: | 7195 | out: |
7157 | return ret; | 7196 | return ret; |
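The read_one_chunk() and btrfs_check_rw_degradable() hunks above all follow from one change: fs_info->mapping_tree is now a plain extent_map_tree rather than a wrapper struct, so callers take map_tree->lock directly instead of reaching through the old map_tree.map_tree member. A short sketch of the resulting lookup pattern is below; the helper name is hypothetical, but the calls inside it are the ones visible in the hunks.

/* Hypothetical helper, sketching the post-change lookup pattern. */
static struct extent_map *lookup_chunk_mapping(struct btrfs_fs_info *fs_info,
					       u64 logical)
{
	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
	struct extent_map *em;

	read_lock(&map_tree->lock);
	em = lookup_extent_mapping(map_tree, logical, 1);
	read_unlock(&map_tree->lock);

	return em;	/* caller drops the reference with free_extent_map() */
}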
@@ -7600,10 +7639,9 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) | |||
7600 | */ | 7639 | */ |
7601 | int btrfs_bg_type_to_factor(u64 flags) | 7640 | int btrfs_bg_type_to_factor(u64 flags) |
7602 | { | 7641 | { |
7603 | if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | | 7642 | const int index = btrfs_bg_flags_to_raid_index(flags); |
7604 | BTRFS_BLOCK_GROUP_RAID10)) | 7643 | |
7605 | return 2; | 7644 | return btrfs_raid_array[index].ncopies; |
7606 | return 1; | ||
7607 | } | 7645 | } |
7608 | 7646 | ||
7609 | 7647 | ||
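btrfs_bg_type_to_factor() stops hard-coding which profiles keep two copies and instead reads ncopies out of btrfs_raid_array, indexed by btrfs_bg_flags_to_raid_index(). A minimal userspace model of that table lookup is below; the enum, the table contents and the profile names are illustrative stand-ins, only the lookup shape follows the hunk. Keeping the answer in the table means a new profile added to btrfs_raid_array automatically reports the right factor without another open-coded flag check.

#include <stdio.h>

/* Illustrative subset of the raid table -- values are examples only. */
enum raid_index { RAID_SINGLE, RAID_DUP, RAID_RAID1, RAID_RAID10, RAID_RAID5, NR_RAID };

struct raid_attr {
	unsigned char ncopies;	/* how many copies of the data are kept */
};

static const struct raid_attr raid_array[NR_RAID] = {
	[RAID_SINGLE] = { .ncopies = 1 },
	[RAID_DUP]    = { .ncopies = 2 },
	[RAID_RAID1]  = { .ncopies = 2 },
	[RAID_RAID10] = { .ncopies = 2 },
	[RAID_RAID5]  = { .ncopies = 1 },	/* parity, not a full copy */
};

/* Replication factor comes straight from the table, as in the hunk. */
static int bg_type_to_factor(enum raid_index index)
{
	return raid_array[index].ncopies;
}

int main(void)
{
	printf("RAID1 factor: %d\n", bg_type_to_factor(RAID_RAID1));
	printf("RAID5 factor: %d\n", bg_type_to_factor(RAID_RAID5));
	return 0;
}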
@@ -7612,7 +7650,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, | |||
7612 | u64 chunk_offset, u64 devid, | 7650 | u64 chunk_offset, u64 devid, |
7613 | u64 physical_offset, u64 physical_len) | 7651 | u64 physical_offset, u64 physical_len) |
7614 | { | 7652 | { |
7615 | struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; | 7653 | struct extent_map_tree *em_tree = &fs_info->mapping_tree; |
7616 | struct extent_map *em; | 7654 | struct extent_map *em; |
7617 | struct map_lookup *map; | 7655 | struct map_lookup *map; |
7618 | struct btrfs_device *dev; | 7656 | struct btrfs_device *dev; |
@@ -7701,7 +7739,7 @@ out: | |||
7701 | 7739 | ||
7702 | static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) | 7740 | static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) |
7703 | { | 7741 | { |
7704 | struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; | 7742 | struct extent_map_tree *em_tree = &fs_info->mapping_tree; |
7705 | struct extent_map *em; | 7743 | struct extent_map *em; |
7706 | struct rb_node *node; | 7744 | struct rb_node *node; |
7707 | int ret = 0; | 7745 | int ret = 0; |
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 136a3eb64604..7f6aa1816409 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h | |||
@@ -23,6 +23,21 @@ struct btrfs_pending_bios { | |||
23 | struct bio *tail; | 23 | struct bio *tail; |
24 | }; | 24 | }; |
25 | 25 | ||
26 | struct btrfs_io_geometry { | ||
27 | /* remaining bytes before crossing a stripe */ | ||
28 | u64 len; | ||
29 | /* offset of logical address in chunk */ | ||
30 | u64 offset; | ||
31 | /* length of single IO stripe */ | ||
32 | u64 stripe_len; | ||
33 | /* number of stripe where address falls */ | ||
34 | u64 stripe_nr; | ||
35 | /* offset of address in stripe */ | ||
36 | u64 stripe_offset; | ||
37 | /* offset of raid56 stripe into the chunk */ | ||
38 | u64 raid56_stripe_offset; | ||
39 | }; | ||
40 | |||
26 | /* | 41 | /* |
27 | * Use sequence counter to get consistent device stat data on | 42 | * Use sequence counter to get consistent device stat data on |
28 | * 32-bit processors. | 43 | * 32-bit processors. |
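The new struct btrfs_io_geometry collects the per-IO stripe numbers that __btrfs_map_block() used to compute inline; btrfs_get_io_geometry(), declared further down in this header, fills it in. The sketch below shows how those fields relate for a simple striped (RAID0-like) layout. The chunk start, stripe_len and request are made-up example values and the helper is hypothetical; only the field meanings follow the comments in the struct above.

#include <stdio.h>
#include <stdint.h>

/* Mirrors the fields of struct btrfs_io_geometry, for illustration only. */
struct io_geometry {
	uint64_t len;		/* remaining bytes before crossing a stripe  */
	uint64_t offset;	/* offset of logical address in chunk        */
	uint64_t stripe_len;	/* length of a single IO stripe              */
	uint64_t stripe_nr;	/* number of the stripe the address falls in */
	uint64_t stripe_offset;	/* offset of the address within the stripe   */
};

/* Hypothetical helper: fill the geometry for a plain striped chunk. */
static void get_io_geometry(uint64_t chunk_start, uint64_t stripe_len,
			    uint64_t logical, uint64_t want,
			    struct io_geometry *g)
{
	g->stripe_len    = stripe_len;
	g->offset        = logical - chunk_start;
	g->stripe_nr     = g->offset / stripe_len;
	g->stripe_offset = g->offset - g->stripe_nr * stripe_len;
	/* A single-device IO must not cross a stripe boundary. */
	g->len = stripe_len - g->stripe_offset;
	if (g->len > want)
		g->len = want;
}

int main(void)
{
	struct io_geometry g;

	/* Example: 64K stripes, 100K request starting 10K into stripe 3. */
	get_io_geometry(1 << 20, 64 * 1024,
			(1 << 20) + 3 * 64 * 1024 + 10 * 1024,
			100 * 1024, &g);
	printf("stripe %llu, offset %llu, usable len %llu\n",
	       (unsigned long long)g.stripe_nr,
	       (unsigned long long)g.stripe_offset,
	       (unsigned long long)g.len);
	return 0;
}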
@@ -43,8 +58,8 @@ struct btrfs_pending_bios { | |||
43 | #define BTRFS_DEV_STATE_FLUSH_SENT (4) | 58 | #define BTRFS_DEV_STATE_FLUSH_SENT (4) |
44 | 59 | ||
45 | struct btrfs_device { | 60 | struct btrfs_device { |
46 | struct list_head dev_list; | 61 | struct list_head dev_list; /* device_list_mutex */ |
47 | struct list_head dev_alloc_list; | 62 | struct list_head dev_alloc_list; /* chunk mutex */ |
48 | struct list_head post_commit_list; /* chunk mutex */ | 63 | struct list_head post_commit_list; /* chunk mutex */ |
49 | struct btrfs_fs_devices *fs_devices; | 64 | struct btrfs_fs_devices *fs_devices; |
50 | struct btrfs_fs_info *fs_info; | 65 | struct btrfs_fs_info *fs_info; |
@@ -229,9 +244,14 @@ struct btrfs_fs_devices { | |||
229 | * this mutex lock. | 244 | * this mutex lock. |
230 | */ | 245 | */ |
231 | struct mutex device_list_mutex; | 246 | struct mutex device_list_mutex; |
247 | |||
248 | /* List of all devices, protected by device_list_mutex */ | ||
232 | struct list_head devices; | 249 | struct list_head devices; |
233 | 250 | ||
234 | /* devices not currently being allocated */ | 251 | /* |
252 | * Devices which can satisfy space allocation. Protected by | ||
253 | * chunk_mutex | ||
254 | */ | ||
235 | struct list_head alloc_list; | 255 | struct list_head alloc_list; |
236 | 256 | ||
237 | struct btrfs_fs_devices *seed; | 257 | struct btrfs_fs_devices *seed; |
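The comments added to struct btrfs_device and struct btrfs_fs_devices document which lock protects which list: devices/dev_list are walked under device_list_mutex, while alloc_list/dev_alloc_list are only touched under the chunk mutex. A hedged sketch of the resulting iteration pattern follows; the walker function itself is hypothetical, the locking rule is the one documented in the hunks above.

/* Hypothetical walker; dev_list entries only change under device_list_mutex. */
static void for_each_device(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* safe to inspect 'device' here */
	}
	mutex_unlock(&fs_devices->device_list_mutex);
}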
@@ -336,16 +356,16 @@ struct btrfs_device_info { | |||
336 | }; | 356 | }; |
337 | 357 | ||
338 | struct btrfs_raid_attr { | 358 | struct btrfs_raid_attr { |
339 | int sub_stripes; /* sub_stripes info for map */ | 359 | u8 sub_stripes; /* sub_stripes info for map */ |
340 | int dev_stripes; /* stripes per dev */ | 360 | u8 dev_stripes; /* stripes per dev */ |
341 | int devs_max; /* max devs to use */ | 361 | u8 devs_max; /* max devs to use */ |
342 | int devs_min; /* min devs needed */ | 362 | u8 devs_min; /* min devs needed */ |
343 | int tolerated_failures; /* max tolerated fail devs */ | 363 | u8 tolerated_failures; /* max tolerated fail devs */ |
344 | int devs_increment; /* ndevs has to be a multiple of this */ | 364 | u8 devs_increment; /* ndevs has to be a multiple of this */ |
345 | int ncopies; /* how many copies to data has */ | 365 | u8 ncopies; /* how many copies to data has */ |
346 | int nparity; /* number of stripes worth of bytes to store | 366 | u8 nparity; /* number of stripes worth of bytes to store |
347 | * parity information */ | 367 | * parity information */ |
348 | int mindev_error; /* error code if min devs requisite is unmet */ | 368 | u8 mindev_error; /* error code if min devs requisite is unmet */ |
349 | const char raid_name[8]; /* name of the raid */ | 369 | const char raid_name[8]; /* name of the raid */ |
350 | u64 bg_flag; /* block group flag of the raid */ | 370 | u64 bg_flag; /* block group flag of the raid */ |
351 | }; | 371 | }; |
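Narrowing the btrfs_raid_attr counters from int to u8 is purely a footprint change; the values stored in these fields are all small. A quick userspace check of the effect on sizeof is below, using mock layouts that mirror the field order above (they are stand-ins, not the real kernel struct).

#include <stdio.h>

/* Mock layouts mirroring the field order above -- for size comparison only. */
struct raid_attr_int {
	int sub_stripes, dev_stripes, devs_max, devs_min, tolerated_failures,
	    devs_increment, ncopies, nparity, mindev_error;
	char raid_name[8];
	unsigned long long bg_flag;
};

struct raid_attr_u8 {
	unsigned char sub_stripes, dev_stripes, devs_max, devs_min,
		      tolerated_failures, devs_increment, ncopies, nparity,
		      mindev_error;
	char raid_name[8];
	unsigned long long bg_flag;
};

int main(void)
{
	printf("int fields: %zu bytes per entry\n", sizeof(struct raid_attr_int));
	printf("u8  fields: %zu bytes per entry\n", sizeof(struct raid_attr_u8));
	return 0;
}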
@@ -408,13 +428,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, | |||
408 | int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, | 428 | int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, |
409 | u64 logical, u64 *length, | 429 | u64 logical, u64 *length, |
410 | struct btrfs_bio **bbio_ret); | 430 | struct btrfs_bio **bbio_ret); |
431 | int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, | ||
432 | u64 logical, u64 len, struct btrfs_io_geometry *io_geom); | ||
411 | int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, | 433 | int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, |
412 | u64 physical, u64 **logical, int *naddrs, int *stripe_len); | 434 | u64 physical, u64 **logical, int *naddrs, int *stripe_len); |
413 | int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); | 435 | int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); |
414 | int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); | 436 | int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); |
415 | int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); | 437 | int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); |
416 | void btrfs_mapping_init(struct btrfs_mapping_tree *tree); | 438 | void btrfs_mapping_tree_free(struct extent_map_tree *tree); |
417 | void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); | ||
418 | blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, | 439 | blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, |
419 | int mirror_num, int async_submit); | 440 | int mirror_num, int async_submit); |
420 | int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | 441 | int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, |
@@ -557,8 +578,6 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) | |||
557 | return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ | 578 | return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ |
558 | } | 579 | } |
559 | 580 | ||
560 | const char *get_raid_name(enum btrfs_raid_types type); | ||
561 | |||
562 | void btrfs_commit_device_sizes(struct btrfs_transaction *trans); | 581 | void btrfs_commit_device_sizes(struct btrfs_transaction *trans); |
563 | 582 | ||
564 | struct list_head *btrfs_get_fs_uuids(void); | 583 | struct list_head *btrfs_get_fs_uuids(void); |
@@ -568,6 +587,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, | |||
568 | struct btrfs_device *failing_dev); | 587 | struct btrfs_device *failing_dev); |
569 | 588 | ||
570 | int btrfs_bg_type_to_factor(u64 flags); | 589 | int btrfs_bg_type_to_factor(u64 flags); |
590 | const char *btrfs_bg_type_to_raid_name(u64 flags); | ||
571 | int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); | 591 | int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); |
572 | 592 | ||
573 | #endif | 593 | #endif |
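The new btrfs_bg_type_to_raid_name() declaration pairs with the raid_name field of btrfs_raid_attr: presumably the name, like the factor above, now comes out of btrfs_raid_array, which would also explain why the old get_raid_name() prototype is dropped from this header. A sketch of that table-backed shape is below; the function body is an assumption, not copied from the patch.

/* Sketch only: assumes the name is read from btrfs_raid_array, like ncopies. */
const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	return btrfs_raid_array[index].raid_name;
}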