path: root/fs/btrfs/extent-tree.c
author    Frederic Weisbecker <fweisbec@gmail.com>  2009-12-07 01:28:35 -0500
committer Frederic Weisbecker <fweisbec@gmail.com>  2009-12-07 01:29:22 -0500
commit    6548698f929814375fa5d62ae1db96959b0418c1 (patch)
tree      340924ae82cb0946aa15045b2b72186de52a8146 /fs/btrfs/extent-tree.c
parent    1d2c6cfd40b2dece3bb958cbbc405a2c1536ab75 (diff)
parent    22763c5cf3690a681551162c15d34d935308c8d7 (diff)
Merge commit 'v2.6.32' into reiserfs/kill-bkl
Merge-reason: The tree was based on 2.6.31. It's better to be up to date with 2.6.32. Although no conflicting changes were made in between, it gives benchmarking results closer to the latest kernel behaviour.
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--  fs/btrfs/extent-tree.c  2319
1 file changed, 1261 insertions, 1058 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72a2b9c28e9f..94627c4cc193 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -32,12 +32,12 @@
32#include "locking.h" 32#include "locking.h"
33#include "free-space-cache.h" 33#include "free-space-cache.h"
34 34
35static int update_reserved_extents(struct btrfs_root *root,
36 u64 bytenr, u64 num, int reserve);
37static int update_block_group(struct btrfs_trans_handle *trans, 35static int update_block_group(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root, 36 struct btrfs_root *root,
39 u64 bytenr, u64 num_bytes, int alloc, 37 u64 bytenr, u64 num_bytes, int alloc,
40 int mark_free); 38 int mark_free);
39static int update_reserved_extents(struct btrfs_block_group_cache *cache,
40 u64 num_bytes, int reserve);
41static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 41static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
42 struct btrfs_root *root, 42 struct btrfs_root *root,
43 u64 bytenr, u64 num_bytes, u64 parent, 43 u64 bytenr, u64 num_bytes, u64 parent,
@@ -57,10 +57,19 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
57 u64 parent, u64 root_objectid, 57 u64 parent, u64 root_objectid,
58 u64 flags, struct btrfs_disk_key *key, 58 u64 flags, struct btrfs_disk_key *key,
59 int level, struct btrfs_key *ins); 59 int level, struct btrfs_key *ins);
60
61static int do_chunk_alloc(struct btrfs_trans_handle *trans, 60static int do_chunk_alloc(struct btrfs_trans_handle *trans,
62 struct btrfs_root *extent_root, u64 alloc_bytes, 61 struct btrfs_root *extent_root, u64 alloc_bytes,
63 u64 flags, int force); 62 u64 flags, int force);
63static int pin_down_bytes(struct btrfs_trans_handle *trans,
64 struct btrfs_root *root,
65 struct btrfs_path *path,
66 u64 bytenr, u64 num_bytes,
67 int is_data, int reserved,
68 struct extent_buffer **must_clean);
69static int find_next_key(struct btrfs_path *path, int level,
70 struct btrfs_key *key);
71static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
72 int dump_block_groups);
64 73
65static noinline int 74static noinline int
66block_group_cache_done(struct btrfs_block_group_cache *cache) 75block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -153,34 +162,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
153 return ret; 162 return ret;
154} 163}
155 164
156/* 165static int add_excluded_extent(struct btrfs_root *root,
157 * We always set EXTENT_LOCKED for the super mirror extents so we don't 166 u64 start, u64 num_bytes)
158 * overwrite them, so those bits need to be unset. Also, if we are unmounting
159 * with pinned extents still sitting there because we had a block group caching,
160 * we need to clear those now, since we are done.
161 */
162void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
163{ 167{
164 u64 start, end, last = 0; 168 u64 end = start + num_bytes - 1;
165 int ret; 169 set_extent_bits(&root->fs_info->freed_extents[0],
170 start, end, EXTENT_UPTODATE, GFP_NOFS);
171 set_extent_bits(&root->fs_info->freed_extents[1],
172 start, end, EXTENT_UPTODATE, GFP_NOFS);
173 return 0;
174}
166 175
167 while (1) { 176static void free_excluded_extents(struct btrfs_root *root,
168 ret = find_first_extent_bit(&info->pinned_extents, last, 177 struct btrfs_block_group_cache *cache)
169 &start, &end, 178{
170 EXTENT_LOCKED|EXTENT_DIRTY); 179 u64 start, end;
171 if (ret)
172 break;
173 180
174 clear_extent_bits(&info->pinned_extents, start, end, 181 start = cache->key.objectid;
175 EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); 182 end = start + cache->key.offset - 1;
176 last = end+1; 183
177 } 184 clear_extent_bits(&root->fs_info->freed_extents[0],
185 start, end, EXTENT_UPTODATE, GFP_NOFS);
186 clear_extent_bits(&root->fs_info->freed_extents[1],
187 start, end, EXTENT_UPTODATE, GFP_NOFS);
178} 188}
179 189
180static int remove_sb_from_cache(struct btrfs_root *root, 190static int exclude_super_stripes(struct btrfs_root *root,
181 struct btrfs_block_group_cache *cache) 191 struct btrfs_block_group_cache *cache)
182{ 192{
183 struct btrfs_fs_info *fs_info = root->fs_info;
184 u64 bytenr; 193 u64 bytenr;
185 u64 *logical; 194 u64 *logical;
186 int stripe_len; 195 int stripe_len;
@@ -192,17 +201,42 @@ static int remove_sb_from_cache(struct btrfs_root *root,
192 cache->key.objectid, bytenr, 201 cache->key.objectid, bytenr,
193 0, &logical, &nr, &stripe_len); 202 0, &logical, &nr, &stripe_len);
194 BUG_ON(ret); 203 BUG_ON(ret);
204
195 while (nr--) { 205 while (nr--) {
196 try_lock_extent(&fs_info->pinned_extents, 206 cache->bytes_super += stripe_len;
197 logical[nr], 207 ret = add_excluded_extent(root, logical[nr],
198 logical[nr] + stripe_len - 1, GFP_NOFS); 208 stripe_len);
209 BUG_ON(ret);
199 } 210 }
211
200 kfree(logical); 212 kfree(logical);
201 } 213 }
202
203 return 0; 214 return 0;
204} 215}
205 216
217static struct btrfs_caching_control *
218get_caching_control(struct btrfs_block_group_cache *cache)
219{
220 struct btrfs_caching_control *ctl;
221
222 spin_lock(&cache->lock);
223 if (cache->cached != BTRFS_CACHE_STARTED) {
224 spin_unlock(&cache->lock);
225 return NULL;
226 }
227
228 ctl = cache->caching_ctl;
229 atomic_inc(&ctl->count);
230 spin_unlock(&cache->lock);
231 return ctl;
232}
233
234static void put_caching_control(struct btrfs_caching_control *ctl)
235{
236 if (atomic_dec_and_test(&ctl->count))
237 kfree(ctl);
238}
239
206/* 240/*
207 * this is only called by cache_block_group, since we could have freed extents 241 * this is only called by cache_block_group, since we could have freed extents
208 * we need to check the pinned_extents for any extents that can't be used yet 242 * we need to check the pinned_extents for any extents that can't be used yet
@@ -215,9 +249,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
215 int ret; 249 int ret;
216 250
217 while (start < end) { 251 while (start < end) {
218 ret = find_first_extent_bit(&info->pinned_extents, start, 252 ret = find_first_extent_bit(info->pinned_extents, start,
219 &extent_start, &extent_end, 253 &extent_start, &extent_end,
220 EXTENT_DIRTY|EXTENT_LOCKED); 254 EXTENT_DIRTY | EXTENT_UPTODATE);
221 if (ret) 255 if (ret)
222 break; 256 break;
223 257
@@ -249,22 +283,27 @@ static int caching_kthread(void *data)
249{ 283{
250 struct btrfs_block_group_cache *block_group = data; 284 struct btrfs_block_group_cache *block_group = data;
251 struct btrfs_fs_info *fs_info = block_group->fs_info; 285 struct btrfs_fs_info *fs_info = block_group->fs_info;
252 u64 last = 0; 286 struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
287 struct btrfs_root *extent_root = fs_info->extent_root;
253 struct btrfs_path *path; 288 struct btrfs_path *path;
254 int ret = 0;
255 struct btrfs_key key;
256 struct extent_buffer *leaf; 289 struct extent_buffer *leaf;
257 int slot; 290 struct btrfs_key key;
258 u64 total_found = 0; 291 u64 total_found = 0;
259 292 u64 last = 0;
260 BUG_ON(!fs_info); 293 u32 nritems;
294 int ret = 0;
261 295
262 path = btrfs_alloc_path(); 296 path = btrfs_alloc_path();
263 if (!path) 297 if (!path)
264 return -ENOMEM; 298 return -ENOMEM;
265 299
266 atomic_inc(&block_group->space_info->caching_threads); 300 exclude_super_stripes(extent_root, block_group);
301 spin_lock(&block_group->space_info->lock);
302 block_group->space_info->bytes_super += block_group->bytes_super;
303 spin_unlock(&block_group->space_info->lock);
304
267 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 305 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
306
268 /* 307 /*
269 * We don't want to deadlock with somebody trying to allocate a new 308 * We don't want to deadlock with somebody trying to allocate a new
270 * extent for the extent root while also trying to search the extent 309 * extent for the extent root while also trying to search the extent
@@ -277,74 +316,64 @@ static int caching_kthread(void *data)
277 316
278 key.objectid = last; 317 key.objectid = last;
279 key.offset = 0; 318 key.offset = 0;
280 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 319 key.type = BTRFS_EXTENT_ITEM_KEY;
281again: 320again:
321 mutex_lock(&caching_ctl->mutex);
282 /* need to make sure the commit_root doesn't disappear */ 322 /* need to make sure the commit_root doesn't disappear */
283 down_read(&fs_info->extent_commit_sem); 323 down_read(&fs_info->extent_commit_sem);
284 324
285 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); 325 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
286 if (ret < 0) 326 if (ret < 0)
287 goto err; 327 goto err;
288 328
329 leaf = path->nodes[0];
330 nritems = btrfs_header_nritems(leaf);
331
289 while (1) { 332 while (1) {
290 smp_mb(); 333 smp_mb();
291 if (block_group->fs_info->closing > 1) { 334 if (fs_info->closing > 1) {
292 last = (u64)-1; 335 last = (u64)-1;
293 break; 336 break;
294 } 337 }
295 338
296 leaf = path->nodes[0]; 339 if (path->slots[0] < nritems) {
297 slot = path->slots[0]; 340 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
298 if (slot >= btrfs_header_nritems(leaf)) { 341 } else {
299 ret = btrfs_next_leaf(fs_info->extent_root, path); 342 ret = find_next_key(path, 0, &key);
300 if (ret < 0) 343 if (ret)
301 goto err;
302 else if (ret)
303 break; 344 break;
304 345
305 if (need_resched() || 346 caching_ctl->progress = last;
306 btrfs_transaction_in_commit(fs_info)) { 347 btrfs_release_path(extent_root, path);
307 leaf = path->nodes[0]; 348 up_read(&fs_info->extent_commit_sem);
308 349 mutex_unlock(&caching_ctl->mutex);
309 /* this shouldn't happen, but if the 350 if (btrfs_transaction_in_commit(fs_info))
310 * leaf is empty just move on.
311 */
312 if (btrfs_header_nritems(leaf) == 0)
313 break;
314 /*
315 * we need to copy the key out so that
316 * we are sure the next search advances
317 * us forward in the btree.
318 */
319 btrfs_item_key_to_cpu(leaf, &key, 0);
320 btrfs_release_path(fs_info->extent_root, path);
321 up_read(&fs_info->extent_commit_sem);
322 schedule_timeout(1); 351 schedule_timeout(1);
323 goto again; 352 else
324 } 353 cond_resched();
354 goto again;
355 }
325 356
357 if (key.objectid < block_group->key.objectid) {
358 path->slots[0]++;
326 continue; 359 continue;
327 } 360 }
328 btrfs_item_key_to_cpu(leaf, &key, slot);
329 if (key.objectid < block_group->key.objectid)
330 goto next;
331 361
332 if (key.objectid >= block_group->key.objectid + 362 if (key.objectid >= block_group->key.objectid +
333 block_group->key.offset) 363 block_group->key.offset)
334 break; 364 break;
335 365
336 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { 366 if (key.type == BTRFS_EXTENT_ITEM_KEY) {
337 total_found += add_new_free_space(block_group, 367 total_found += add_new_free_space(block_group,
338 fs_info, last, 368 fs_info, last,
339 key.objectid); 369 key.objectid);
340 last = key.objectid + key.offset; 370 last = key.objectid + key.offset;
341 }
342 371
343 if (total_found > (1024 * 1024 * 2)) { 372 if (total_found > (1024 * 1024 * 2)) {
344 total_found = 0; 373 total_found = 0;
345 wake_up(&block_group->caching_q); 374 wake_up(&caching_ctl->wait);
375 }
346 } 376 }
347next:
348 path->slots[0]++; 377 path->slots[0]++;
349 } 378 }
350 ret = 0; 379 ret = 0;
@@ -352,33 +381,65 @@ next:
352 total_found += add_new_free_space(block_group, fs_info, last, 381 total_found += add_new_free_space(block_group, fs_info, last,
353 block_group->key.objectid + 382 block_group->key.objectid +
354 block_group->key.offset); 383 block_group->key.offset);
384 caching_ctl->progress = (u64)-1;
355 385
356 spin_lock(&block_group->lock); 386 spin_lock(&block_group->lock);
387 block_group->caching_ctl = NULL;
357 block_group->cached = BTRFS_CACHE_FINISHED; 388 block_group->cached = BTRFS_CACHE_FINISHED;
358 spin_unlock(&block_group->lock); 389 spin_unlock(&block_group->lock);
359 390
360err: 391err:
361 btrfs_free_path(path); 392 btrfs_free_path(path);
362 up_read(&fs_info->extent_commit_sem); 393 up_read(&fs_info->extent_commit_sem);
363 atomic_dec(&block_group->space_info->caching_threads);
364 wake_up(&block_group->caching_q);
365 394
395 free_excluded_extents(extent_root, block_group);
396
397 mutex_unlock(&caching_ctl->mutex);
398 wake_up(&caching_ctl->wait);
399
400 put_caching_control(caching_ctl);
401 atomic_dec(&block_group->space_info->caching_threads);
366 return 0; 402 return 0;
367} 403}
368 404
369static int cache_block_group(struct btrfs_block_group_cache *cache) 405static int cache_block_group(struct btrfs_block_group_cache *cache)
370{ 406{
407 struct btrfs_fs_info *fs_info = cache->fs_info;
408 struct btrfs_caching_control *caching_ctl;
371 struct task_struct *tsk; 409 struct task_struct *tsk;
372 int ret = 0; 410 int ret = 0;
373 411
412 smp_mb();
413 if (cache->cached != BTRFS_CACHE_NO)
414 return 0;
415
416 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
417 BUG_ON(!caching_ctl);
418
419 INIT_LIST_HEAD(&caching_ctl->list);
420 mutex_init(&caching_ctl->mutex);
421 init_waitqueue_head(&caching_ctl->wait);
422 caching_ctl->block_group = cache;
423 caching_ctl->progress = cache->key.objectid;
424 /* one for caching kthread, one for caching block group list */
425 atomic_set(&caching_ctl->count, 2);
426
374 spin_lock(&cache->lock); 427 spin_lock(&cache->lock);
375 if (cache->cached != BTRFS_CACHE_NO) { 428 if (cache->cached != BTRFS_CACHE_NO) {
376 spin_unlock(&cache->lock); 429 spin_unlock(&cache->lock);
377 return ret; 430 kfree(caching_ctl);
431 return 0;
378 } 432 }
433 cache->caching_ctl = caching_ctl;
379 cache->cached = BTRFS_CACHE_STARTED; 434 cache->cached = BTRFS_CACHE_STARTED;
380 spin_unlock(&cache->lock); 435 spin_unlock(&cache->lock);
381 436
437 down_write(&fs_info->extent_commit_sem);
438 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
439 up_write(&fs_info->extent_commit_sem);
440
441 atomic_inc(&cache->space_info->caching_threads);
442
382 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", 443 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
383 cache->key.objectid); 444 cache->key.objectid);
384 if (IS_ERR(tsk)) { 445 if (IS_ERR(tsk)) {
@@ -1507,22 +1568,23 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1507 return ret; 1568 return ret;
1508} 1569}
1509 1570
1510#ifdef BIO_RW_DISCARD
1511static void btrfs_issue_discard(struct block_device *bdev, 1571static void btrfs_issue_discard(struct block_device *bdev,
1512 u64 start, u64 len) 1572 u64 start, u64 len)
1513{ 1573{
1514 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); 1574 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1575 DISCARD_FL_BARRIER);
1515} 1576}
1516#endif
1517 1577
1518static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1578static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1519 u64 num_bytes) 1579 u64 num_bytes)
1520{ 1580{
1521#ifdef BIO_RW_DISCARD
1522 int ret; 1581 int ret;
1523 u64 map_length = num_bytes; 1582 u64 map_length = num_bytes;
1524 struct btrfs_multi_bio *multi = NULL; 1583 struct btrfs_multi_bio *multi = NULL;
1525 1584
1585 if (!btrfs_test_opt(root, DISCARD))
1586 return 0;
1587
1526 /* Tell the block device(s) that the sectors can be discarded */ 1588 /* Tell the block device(s) that the sectors can be discarded */
1527 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, 1589 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
1528 bytenr, &map_length, &multi, 0); 1590 bytenr, &map_length, &multi, 0);
@@ -1542,9 +1604,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1542 } 1604 }
1543 1605
1544 return ret; 1606 return ret;
1545#else
1546 return 0;
1547#endif
1548} 1607}
1549 1608
1550int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1609int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
@@ -1656,7 +1715,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1656 parent, ref_root, flags, 1715 parent, ref_root, flags,
1657 ref->objectid, ref->offset, 1716 ref->objectid, ref->offset,
1658 &ins, node->ref_mod); 1717 &ins, node->ref_mod);
1659 update_reserved_extents(root, ins.objectid, ins.offset, 0);
1660 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1718 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1661 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 1719 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1662 node->num_bytes, parent, 1720 node->num_bytes, parent,
@@ -1782,7 +1840,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1782 extent_op->flags_to_set, 1840 extent_op->flags_to_set,
1783 &extent_op->key, 1841 &extent_op->key,
1784 ref->level, &ins); 1842 ref->level, &ins);
1785 update_reserved_extents(root, ins.objectid, ins.offset, 0);
1786 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1843 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1787 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 1844 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1788 node->num_bytes, parent, ref_root, 1845 node->num_bytes, parent, ref_root,
@@ -1817,16 +1874,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1817 BUG_ON(extent_op); 1874 BUG_ON(extent_op);
1818 head = btrfs_delayed_node_to_head(node); 1875 head = btrfs_delayed_node_to_head(node);
1819 if (insert_reserved) { 1876 if (insert_reserved) {
1877 int mark_free = 0;
1878 struct extent_buffer *must_clean = NULL;
1879
1880 ret = pin_down_bytes(trans, root, NULL,
1881 node->bytenr, node->num_bytes,
1882 head->is_data, 1, &must_clean);
1883 if (ret > 0)
1884 mark_free = 1;
1885
1886 if (must_clean) {
1887 clean_tree_block(NULL, root, must_clean);
1888 btrfs_tree_unlock(must_clean);
1889 free_extent_buffer(must_clean);
1890 }
1820 if (head->is_data) { 1891 if (head->is_data) {
1821 ret = btrfs_del_csums(trans, root, 1892 ret = btrfs_del_csums(trans, root,
1822 node->bytenr, 1893 node->bytenr,
1823 node->num_bytes); 1894 node->num_bytes);
1824 BUG_ON(ret); 1895 BUG_ON(ret);
1825 } 1896 }
1826 btrfs_update_pinned_extents(root, node->bytenr, 1897 if (mark_free) {
1827 node->num_bytes, 1); 1898 ret = btrfs_free_reserved_extent(root,
1828 update_reserved_extents(root, node->bytenr, 1899 node->bytenr,
1829 node->num_bytes, 0); 1900 node->num_bytes);
1901 BUG_ON(ret);
1902 }
1830 } 1903 }
1831 mutex_unlock(&head->mutex); 1904 mutex_unlock(&head->mutex);
1832 return 0; 1905 return 0;
@@ -2691,60 +2764,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2691 alloc_target); 2764 alloc_target);
2692} 2765}
2693 2766
2767static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2768{
2769 u64 num_bytes;
2770 int level;
2771
2772 level = BTRFS_MAX_LEVEL - 2;
2773 /*
2774 * NOTE: these calculations are absolutely the worst possible case.
2775 * This assumes that _every_ item we insert will require a new leaf, and
2776 * that the tree has grown to its maximum level size.
2777 */
2778
2779 /*
2780 * for every item we insert we could insert both an extent item and a
2781 * extent ref item. Then for ever item we insert, we will need to cow
2782 * both the original leaf, plus the leaf to the left and right of it.
2783 *
2784 * Unless we are talking about the extent root, then we just want the
2785 * number of items * 2, since we just need the extent item plus its ref.
2786 */
2787 if (root == root->fs_info->extent_root)
2788 num_bytes = num_items * 2;
2789 else
2790 num_bytes = (num_items + (2 * num_items)) * 3;
2791
2792 /*
2793 * num_bytes is total number of leaves we could need times the leaf
2794 * size, and then for every leaf we could end up cow'ing 2 nodes per
2795 * level, down to the leaf level.
2796 */
2797 num_bytes = (num_bytes * root->leafsize) +
2798 (num_bytes * (level * 2)) * root->nodesize;
2799
2800 return num_bytes;
2801}
2802
2694/* 2803/*
2695 * for now this just makes sure we have at least 5% of our metadata space free 2804 * Unreserve metadata space for delalloc. If we have less reserved credits than
2696 * for use. 2805 * we have extents, this function does nothing.
2697 */ 2806 */
2698int btrfs_check_metadata_free_space(struct btrfs_root *root) 2807int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2808 struct inode *inode, int num_items)
2699{ 2809{
2700 struct btrfs_fs_info *info = root->fs_info; 2810 struct btrfs_fs_info *info = root->fs_info;
2701 struct btrfs_space_info *meta_sinfo; 2811 struct btrfs_space_info *meta_sinfo;
2702 u64 alloc_target, thresh; 2812 u64 num_bytes;
2703 int committed = 0, ret; 2813 u64 alloc_target;
2814 bool bug = false;
2704 2815
2705 /* get the space info for where the metadata will live */ 2816 /* get the space info for where the metadata will live */
2706 alloc_target = btrfs_get_alloc_profile(root, 0); 2817 alloc_target = btrfs_get_alloc_profile(root, 0);
2707 meta_sinfo = __find_space_info(info, alloc_target); 2818 meta_sinfo = __find_space_info(info, alloc_target);
2708 2819
2709again: 2820 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2821 num_items);
2822
2710 spin_lock(&meta_sinfo->lock); 2823 spin_lock(&meta_sinfo->lock);
2711 if (!meta_sinfo->full) 2824 spin_lock(&BTRFS_I(inode)->accounting_lock);
2712 thresh = meta_sinfo->total_bytes * 80; 2825 if (BTRFS_I(inode)->reserved_extents <=
2713 else 2826 BTRFS_I(inode)->outstanding_extents) {
2714 thresh = meta_sinfo->total_bytes * 95; 2827 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2828 spin_unlock(&meta_sinfo->lock);
2829 return 0;
2830 }
2831 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2832
2833 BTRFS_I(inode)->reserved_extents--;
2834 BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2835
2836 if (meta_sinfo->bytes_delalloc < num_bytes) {
2837 bug = true;
2838 meta_sinfo->bytes_delalloc = 0;
2839 } else {
2840 meta_sinfo->bytes_delalloc -= num_bytes;
2841 }
2842 spin_unlock(&meta_sinfo->lock);
2843
2844 BUG_ON(bug);
2845
2846 return 0;
2847}
2715 2848
2849static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2850{
2851 u64 thresh;
2852
2853 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2854 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2855 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2856 meta_sinfo->bytes_may_use;
2857
2858 thresh = meta_sinfo->total_bytes - thresh;
2859 thresh *= 80;
2716 do_div(thresh, 100); 2860 do_div(thresh, 100);
2861 if (thresh <= meta_sinfo->bytes_delalloc)
2862 meta_sinfo->force_delalloc = 1;
2863 else
2864 meta_sinfo->force_delalloc = 0;
2865}
2717 2866
2718 if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2867struct async_flush {
2719 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { 2868 struct btrfs_root *root;
2720 struct btrfs_trans_handle *trans; 2869 struct btrfs_space_info *info;
2721 if (!meta_sinfo->full) { 2870 struct btrfs_work work;
2722 meta_sinfo->force_alloc = 1; 2871};
2723 spin_unlock(&meta_sinfo->lock);
2724 2872
2725 trans = btrfs_start_transaction(root, 1); 2873static noinline void flush_delalloc_async(struct btrfs_work *work)
2726 if (!trans) 2874{
2727 return -ENOMEM; 2875 struct async_flush *async;
2876 struct btrfs_root *root;
2877 struct btrfs_space_info *info;
2728 2878
2729 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2879 async = container_of(work, struct async_flush, work);
2730 2 * 1024 * 1024, alloc_target, 0); 2880 root = async->root;
2731 btrfs_end_transaction(trans, root); 2881 info = async->info;
2882
2883 btrfs_start_delalloc_inodes(root);
2884 wake_up(&info->flush_wait);
2885 btrfs_wait_ordered_extents(root, 0);
2886
2887 spin_lock(&info->lock);
2888 info->flushing = 0;
2889 spin_unlock(&info->lock);
2890 wake_up(&info->flush_wait);
2891
2892 kfree(async);
2893}
2894
2895static void wait_on_flush(struct btrfs_space_info *info)
2896{
2897 DEFINE_WAIT(wait);
2898 u64 used;
2899
2900 while (1) {
2901 prepare_to_wait(&info->flush_wait, &wait,
2902 TASK_UNINTERRUPTIBLE);
2903 spin_lock(&info->lock);
2904 if (!info->flushing) {
2905 spin_unlock(&info->lock);
2906 break;
2907 }
2908
2909 used = info->bytes_used + info->bytes_reserved +
2910 info->bytes_pinned + info->bytes_readonly +
2911 info->bytes_super + info->bytes_root +
2912 info->bytes_may_use + info->bytes_delalloc;
2913 if (used < info->total_bytes) {
2914 spin_unlock(&info->lock);
2915 break;
2916 }
2917 spin_unlock(&info->lock);
2918 schedule();
2919 }
2920 finish_wait(&info->flush_wait, &wait);
2921}
2922
2923static void flush_delalloc(struct btrfs_root *root,
2924 struct btrfs_space_info *info)
2925{
2926 struct async_flush *async;
2927 bool wait = false;
2928
2929 spin_lock(&info->lock);
2930
2931 if (!info->flushing) {
2932 info->flushing = 1;
2933 init_waitqueue_head(&info->flush_wait);
2934 } else {
2935 wait = true;
2936 }
2937
2938 spin_unlock(&info->lock);
2939
2940 if (wait) {
2941 wait_on_flush(info);
2942 return;
2943 }
2944
2945 async = kzalloc(sizeof(*async), GFP_NOFS);
2946 if (!async)
2947 goto flush;
2948
2949 async->root = root;
2950 async->info = info;
2951 async->work.func = flush_delalloc_async;
2952
2953 btrfs_queue_worker(&root->fs_info->enospc_workers,
2954 &async->work);
2955 wait_on_flush(info);
2956 return;
2957
2958flush:
2959 btrfs_start_delalloc_inodes(root);
2960 btrfs_wait_ordered_extents(root, 0);
2961
2962 spin_lock(&info->lock);
2963 info->flushing = 0;
2964 spin_unlock(&info->lock);
2965 wake_up(&info->flush_wait);
2966}
2967
2968static int maybe_allocate_chunk(struct btrfs_root *root,
2969 struct btrfs_space_info *info)
2970{
2971 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2972 struct btrfs_trans_handle *trans;
2973 bool wait = false;
2974 int ret = 0;
2975 u64 min_metadata;
2976 u64 free_space;
2977
2978 free_space = btrfs_super_total_bytes(disk_super);
2979 /*
2980 * we allow the metadata to grow to a max of either 10gb or 5% of the
2981 * space in the volume.
2982 */
2983 min_metadata = min((u64)10 * 1024 * 1024 * 1024,
2984 div64_u64(free_space * 5, 100));
2985 if (info->total_bytes >= min_metadata) {
2986 spin_unlock(&info->lock);
2987 return 0;
2988 }
2989
2990 if (info->full) {
2991 spin_unlock(&info->lock);
2992 return 0;
2993 }
2994
2995 if (!info->allocating_chunk) {
2996 info->force_alloc = 1;
2997 info->allocating_chunk = 1;
2998 init_waitqueue_head(&info->allocate_wait);
2999 } else {
3000 wait = true;
3001 }
3002
3003 spin_unlock(&info->lock);
3004
3005 if (wait) {
3006 wait_event(info->allocate_wait,
3007 !info->allocating_chunk);
3008 return 1;
3009 }
3010
3011 trans = btrfs_start_transaction(root, 1);
3012 if (!trans) {
3013 ret = -ENOMEM;
3014 goto out;
3015 }
3016
3017 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3018 4096 + 2 * 1024 * 1024,
3019 info->flags, 0);
3020 btrfs_end_transaction(trans, root);
3021 if (ret)
3022 goto out;
3023out:
3024 spin_lock(&info->lock);
3025 info->allocating_chunk = 0;
3026 spin_unlock(&info->lock);
3027 wake_up(&info->allocate_wait);
3028
3029 if (ret)
3030 return 0;
3031 return 1;
3032}
3033
3034/*
3035 * Reserve metadata space for delalloc.
3036 */
3037int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3038 struct inode *inode, int num_items)
3039{
3040 struct btrfs_fs_info *info = root->fs_info;
3041 struct btrfs_space_info *meta_sinfo;
3042 u64 num_bytes;
3043 u64 used;
3044 u64 alloc_target;
3045 int flushed = 0;
3046 int force_delalloc;
3047
3048 /* get the space info for where the metadata will live */
3049 alloc_target = btrfs_get_alloc_profile(root, 0);
3050 meta_sinfo = __find_space_info(info, alloc_target);
3051
3052 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3053 num_items);
3054again:
3055 spin_lock(&meta_sinfo->lock);
3056
3057 force_delalloc = meta_sinfo->force_delalloc;
3058
3059 if (unlikely(!meta_sinfo->bytes_root))
3060 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3061
3062 if (!flushed)
3063 meta_sinfo->bytes_delalloc += num_bytes;
3064
3065 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3066 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3067 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3068 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3069
3070 if (used > meta_sinfo->total_bytes) {
3071 flushed++;
3072
3073 if (flushed == 1) {
3074 if (maybe_allocate_chunk(root, meta_sinfo))
3075 goto again;
3076 flushed++;
3077 } else {
3078 spin_unlock(&meta_sinfo->lock);
3079 }
3080
3081 if (flushed == 2) {
3082 filemap_flush(inode->i_mapping);
3083 goto again;
3084 } else if (flushed == 3) {
3085 flush_delalloc(root, meta_sinfo);
2732 goto again; 3086 goto again;
2733 } 3087 }
3088 spin_lock(&meta_sinfo->lock);
3089 meta_sinfo->bytes_delalloc -= num_bytes;
2734 spin_unlock(&meta_sinfo->lock); 3090 spin_unlock(&meta_sinfo->lock);
3091 printk(KERN_ERR "enospc, has %d, reserved %d\n",
3092 BTRFS_I(inode)->outstanding_extents,
3093 BTRFS_I(inode)->reserved_extents);
3094 dump_space_info(meta_sinfo, 0, 0);
3095 return -ENOSPC;
3096 }
2735 3097
2736 if (!committed) { 3098 BTRFS_I(inode)->reserved_extents++;
2737 committed = 1; 3099 check_force_delalloc(meta_sinfo);
2738 trans = btrfs_join_transaction(root, 1); 3100 spin_unlock(&meta_sinfo->lock);
2739 if (!trans) 3101
2740 return -ENOMEM; 3102 if (!flushed && force_delalloc)
2741 ret = btrfs_commit_transaction(trans, root); 3103 filemap_flush(inode->i_mapping);
2742 if (ret) 3104
2743 return ret; 3105 return 0;
3106}
3107
3108/*
3109 * unreserve num_items number of items worth of metadata space. This needs to
3110 * be paired with btrfs_reserve_metadata_space.
3111 *
3112 * NOTE: if you have the option, run this _AFTER_ you do a
3113 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3114 * oprations which will result in more used metadata, so we want to make sure we
3115 * can do that without issue.
3116 */
3117int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3118{
3119 struct btrfs_fs_info *info = root->fs_info;
3120 struct btrfs_space_info *meta_sinfo;
3121 u64 num_bytes;
3122 u64 alloc_target;
3123 bool bug = false;
3124
3125 /* get the space info for where the metadata will live */
3126 alloc_target = btrfs_get_alloc_profile(root, 0);
3127 meta_sinfo = __find_space_info(info, alloc_target);
3128
3129 num_bytes = calculate_bytes_needed(root, num_items);
3130
3131 spin_lock(&meta_sinfo->lock);
3132 if (meta_sinfo->bytes_may_use < num_bytes) {
3133 bug = true;
3134 meta_sinfo->bytes_may_use = 0;
3135 } else {
3136 meta_sinfo->bytes_may_use -= num_bytes;
3137 }
3138 spin_unlock(&meta_sinfo->lock);
3139
3140 BUG_ON(bug);
3141
3142 return 0;
3143}
3144
3145/*
3146 * Reserve some metadata space for use. We'll calculate the worste case number
3147 * of bytes that would be needed to modify num_items number of items. If we
3148 * have space, fantastic, if not, you get -ENOSPC. Please call
3149 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3150 * items you reserved, since whatever metadata you needed should have already
3151 * been allocated.
3152 *
3153 * This will commit the transaction to make more space if we don't have enough
3154 * metadata space. THe only time we don't do this is if we're reserving space
3155 * inside of a transaction, then we will just return -ENOSPC and it is the
3156 * callers responsibility to handle it properly.
3157 */
3158int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3159{
3160 struct btrfs_fs_info *info = root->fs_info;
3161 struct btrfs_space_info *meta_sinfo;
3162 u64 num_bytes;
3163 u64 used;
3164 u64 alloc_target;
3165 int retries = 0;
3166
3167 /* get the space info for where the metadata will live */
3168 alloc_target = btrfs_get_alloc_profile(root, 0);
3169 meta_sinfo = __find_space_info(info, alloc_target);
3170
3171 num_bytes = calculate_bytes_needed(root, num_items);
3172again:
3173 spin_lock(&meta_sinfo->lock);
3174
3175 if (unlikely(!meta_sinfo->bytes_root))
3176 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3177
3178 if (!retries)
3179 meta_sinfo->bytes_may_use += num_bytes;
3180
3181 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3182 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3183 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3184 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3185
3186 if (used > meta_sinfo->total_bytes) {
3187 retries++;
3188 if (retries == 1) {
3189 if (maybe_allocate_chunk(root, meta_sinfo))
3190 goto again;
3191 retries++;
3192 } else {
3193 spin_unlock(&meta_sinfo->lock);
3194 }
3195
3196 if (retries == 2) {
3197 flush_delalloc(root, meta_sinfo);
2744 goto again; 3198 goto again;
2745 } 3199 }
3200 spin_lock(&meta_sinfo->lock);
3201 meta_sinfo->bytes_may_use -= num_bytes;
3202 spin_unlock(&meta_sinfo->lock);
3203
3204 dump_space_info(meta_sinfo, 0, 0);
2746 return -ENOSPC; 3205 return -ENOSPC;
2747 } 3206 }
3207
3208 check_force_delalloc(meta_sinfo);
2748 spin_unlock(&meta_sinfo->lock); 3209 spin_unlock(&meta_sinfo->lock);
2749 3210
2750 return 0; 3211 return 0;
@@ -2764,13 +3225,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2764 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3225 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2765 3226
2766 data_sinfo = BTRFS_I(inode)->space_info; 3227 data_sinfo = BTRFS_I(inode)->space_info;
3228 if (!data_sinfo)
3229 goto alloc;
3230
2767again: 3231again:
2768 /* make sure we have enough space to handle the data first */ 3232 /* make sure we have enough space to handle the data first */
2769 spin_lock(&data_sinfo->lock); 3233 spin_lock(&data_sinfo->lock);
2770 if (data_sinfo->total_bytes - data_sinfo->bytes_used - 3234 if (data_sinfo->total_bytes - data_sinfo->bytes_used -
2771 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - 3235 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
2772 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - 3236 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
2773 data_sinfo->bytes_may_use < bytes) { 3237 data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
2774 struct btrfs_trans_handle *trans; 3238 struct btrfs_trans_handle *trans;
2775 3239
2776 /* 3240 /*
@@ -2782,7 +3246,7 @@ again:
2782 3246
2783 data_sinfo->force_alloc = 1; 3247 data_sinfo->force_alloc = 1;
2784 spin_unlock(&data_sinfo->lock); 3248 spin_unlock(&data_sinfo->lock);
2785 3249alloc:
2786 alloc_target = btrfs_get_alloc_profile(root, 1); 3250 alloc_target = btrfs_get_alloc_profile(root, 1);
2787 trans = btrfs_start_transaction(root, 1); 3251 trans = btrfs_start_transaction(root, 1);
2788 if (!trans) 3252 if (!trans)
@@ -2794,12 +3258,17 @@ again:
2794 btrfs_end_transaction(trans, root); 3258 btrfs_end_transaction(trans, root);
2795 if (ret) 3259 if (ret)
2796 return ret; 3260 return ret;
3261
3262 if (!data_sinfo) {
3263 btrfs_set_inode_space_info(root, inode);
3264 data_sinfo = BTRFS_I(inode)->space_info;
3265 }
2797 goto again; 3266 goto again;
2798 } 3267 }
2799 spin_unlock(&data_sinfo->lock); 3268 spin_unlock(&data_sinfo->lock);
2800 3269
2801 /* commit the current transaction and try again */ 3270 /* commit the current transaction and try again */
2802 if (!committed) { 3271 if (!committed && !root->fs_info->open_ioctl_trans) {
2803 committed = 1; 3272 committed = 1;
2804 trans = btrfs_join_transaction(root, 1); 3273 trans = btrfs_join_transaction(root, 1);
2805 if (!trans) 3274 if (!trans)
@@ -2827,7 +3296,7 @@ again:
2827 BTRFS_I(inode)->reserved_bytes += bytes; 3296 BTRFS_I(inode)->reserved_bytes += bytes;
2828 spin_unlock(&data_sinfo->lock); 3297 spin_unlock(&data_sinfo->lock);
2829 3298
2830 return btrfs_check_metadata_free_space(root); 3299 return 0;
2831} 3300}
2832 3301
2833/* 3302/*
@@ -2926,17 +3395,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2926 BUG_ON(!space_info); 3395 BUG_ON(!space_info);
2927 3396
2928 spin_lock(&space_info->lock); 3397 spin_lock(&space_info->lock);
2929 if (space_info->force_alloc) { 3398 if (space_info->force_alloc)
2930 force = 1; 3399 force = 1;
2931 space_info->force_alloc = 0;
2932 }
2933 if (space_info->full) { 3400 if (space_info->full) {
2934 spin_unlock(&space_info->lock); 3401 spin_unlock(&space_info->lock);
2935 goto out; 3402 goto out;
2936 } 3403 }
2937 3404
2938 thresh = space_info->total_bytes - space_info->bytes_readonly; 3405 thresh = space_info->total_bytes - space_info->bytes_readonly;
2939 thresh = div_factor(thresh, 6); 3406 thresh = div_factor(thresh, 8);
2940 if (!force && 3407 if (!force &&
2941 (space_info->bytes_used + space_info->bytes_pinned + 3408 (space_info->bytes_used + space_info->bytes_pinned +
2942 space_info->bytes_reserved + alloc_bytes) < thresh) { 3409 space_info->bytes_reserved + alloc_bytes) < thresh) {
@@ -2950,7 +3417,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2950 * we keep a reasonable number of metadata chunks allocated in the 3417 * we keep a reasonable number of metadata chunks allocated in the
2951 * FS as well. 3418 * FS as well.
2952 */ 3419 */
2953 if (flags & BTRFS_BLOCK_GROUP_DATA) { 3420 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
2954 fs_info->data_chunk_allocations++; 3421 fs_info->data_chunk_allocations++;
2955 if (!(fs_info->data_chunk_allocations % 3422 if (!(fs_info->data_chunk_allocations %
2956 fs_info->metadata_ratio)) 3423 fs_info->metadata_ratio))
@@ -2958,8 +3425,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2958 } 3425 }
2959 3426
2960 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3427 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3428 spin_lock(&space_info->lock);
2961 if (ret) 3429 if (ret)
2962 space_info->full = 1; 3430 space_info->full = 1;
3431 space_info->force_alloc = 0;
3432 spin_unlock(&space_info->lock);
2963out: 3433out:
2964 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3434 mutex_unlock(&extent_root->fs_info->chunk_mutex);
2965 return ret; 3435 return ret;
@@ -3008,10 +3478,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3008 num_bytes = min(total, cache->key.offset - byte_in_group); 3478 num_bytes = min(total, cache->key.offset - byte_in_group);
3009 if (alloc) { 3479 if (alloc) {
3010 old_val += num_bytes; 3480 old_val += num_bytes;
3481 btrfs_set_block_group_used(&cache->item, old_val);
3482 cache->reserved -= num_bytes;
3011 cache->space_info->bytes_used += num_bytes; 3483 cache->space_info->bytes_used += num_bytes;
3484 cache->space_info->bytes_reserved -= num_bytes;
3012 if (cache->ro) 3485 if (cache->ro)
3013 cache->space_info->bytes_readonly -= num_bytes; 3486 cache->space_info->bytes_readonly -= num_bytes;
3014 btrfs_set_block_group_used(&cache->item, old_val);
3015 spin_unlock(&cache->lock); 3487 spin_unlock(&cache->lock);
3016 spin_unlock(&cache->space_info->lock); 3488 spin_unlock(&cache->space_info->lock);
3017 } else { 3489 } else {
@@ -3056,127 +3528,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3056 return bytenr; 3528 return bytenr;
3057} 3529}
3058 3530
3059int btrfs_update_pinned_extents(struct btrfs_root *root, 3531/*
3060 u64 bytenr, u64 num, int pin) 3532 * this function must be called within transaction
3533 */
3534int btrfs_pin_extent(struct btrfs_root *root,
3535 u64 bytenr, u64 num_bytes, int reserved)
3061{ 3536{
3062 u64 len;
3063 struct btrfs_block_group_cache *cache;
3064 struct btrfs_fs_info *fs_info = root->fs_info; 3537 struct btrfs_fs_info *fs_info = root->fs_info;
3538 struct btrfs_block_group_cache *cache;
3065 3539
3066 if (pin) 3540 cache = btrfs_lookup_block_group(fs_info, bytenr);
3067 set_extent_dirty(&fs_info->pinned_extents, 3541 BUG_ON(!cache);
3068 bytenr, bytenr + num - 1, GFP_NOFS);
3069
3070 while (num > 0) {
3071 cache = btrfs_lookup_block_group(fs_info, bytenr);
3072 BUG_ON(!cache);
3073 len = min(num, cache->key.offset -
3074 (bytenr - cache->key.objectid));
3075 if (pin) {
3076 spin_lock(&cache->space_info->lock);
3077 spin_lock(&cache->lock);
3078 cache->pinned += len;
3079 cache->space_info->bytes_pinned += len;
3080 spin_unlock(&cache->lock);
3081 spin_unlock(&cache->space_info->lock);
3082 fs_info->total_pinned += len;
3083 } else {
3084 int unpin = 0;
3085 3542
3086 /* 3543 spin_lock(&cache->space_info->lock);
3087 * in order to not race with the block group caching, we 3544 spin_lock(&cache->lock);
3088 * only want to unpin the extent if we are cached. If 3545 cache->pinned += num_bytes;
3089 * we aren't cached, we want to start async caching this 3546 cache->space_info->bytes_pinned += num_bytes;
3090 * block group so we can free the extent the next time 3547 if (reserved) {
3091 * around. 3548 cache->reserved -= num_bytes;
3092 */ 3549 cache->space_info->bytes_reserved -= num_bytes;
3093 spin_lock(&cache->space_info->lock); 3550 }
3094 spin_lock(&cache->lock); 3551 spin_unlock(&cache->lock);
3095 unpin = (cache->cached == BTRFS_CACHE_FINISHED); 3552 spin_unlock(&cache->space_info->lock);
3096 if (likely(unpin)) {
3097 cache->pinned -= len;
3098 cache->space_info->bytes_pinned -= len;
3099 fs_info->total_pinned -= len;
3100 }
3101 spin_unlock(&cache->lock);
3102 spin_unlock(&cache->space_info->lock);
3103 3553
3104 if (likely(unpin)) 3554 btrfs_put_block_group(cache);
3105 clear_extent_dirty(&fs_info->pinned_extents,
3106 bytenr, bytenr + len -1,
3107 GFP_NOFS);
3108 else
3109 cache_block_group(cache);
3110 3555
3111 if (unpin) 3556 set_extent_dirty(fs_info->pinned_extents,
3112 btrfs_add_free_space(cache, bytenr, len); 3557 bytenr, bytenr + num_bytes - 1, GFP_NOFS);
3113 } 3558 return 0;
3114 btrfs_put_block_group(cache); 3559}
3115 bytenr += len; 3560
3116 num -= len; 3561static int update_reserved_extents(struct btrfs_block_group_cache *cache,
3562 u64 num_bytes, int reserve)
3563{
3564 spin_lock(&cache->space_info->lock);
3565 spin_lock(&cache->lock);
3566 if (reserve) {
3567 cache->reserved += num_bytes;
3568 cache->space_info->bytes_reserved += num_bytes;
3569 } else {
3570 cache->reserved -= num_bytes;
3571 cache->space_info->bytes_reserved -= num_bytes;
3117 } 3572 }
3573 spin_unlock(&cache->lock);
3574 spin_unlock(&cache->space_info->lock);
3118 return 0; 3575 return 0;
3119} 3576}
3120 3577
3121static int update_reserved_extents(struct btrfs_root *root, 3578int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3122 u64 bytenr, u64 num, int reserve) 3579 struct btrfs_root *root)
3123{ 3580{
3124 u64 len;
3125 struct btrfs_block_group_cache *cache;
3126 struct btrfs_fs_info *fs_info = root->fs_info; 3581 struct btrfs_fs_info *fs_info = root->fs_info;
3582 struct btrfs_caching_control *next;
3583 struct btrfs_caching_control *caching_ctl;
3584 struct btrfs_block_group_cache *cache;
3127 3585
3128 while (num > 0) { 3586 down_write(&fs_info->extent_commit_sem);
3129 cache = btrfs_lookup_block_group(fs_info, bytenr);
3130 BUG_ON(!cache);
3131 len = min(num, cache->key.offset -
3132 (bytenr - cache->key.objectid));
3133 3587
3134 spin_lock(&cache->space_info->lock); 3588 list_for_each_entry_safe(caching_ctl, next,
3135 spin_lock(&cache->lock); 3589 &fs_info->caching_block_groups, list) {
3136 if (reserve) { 3590 cache = caching_ctl->block_group;
3137 cache->reserved += len; 3591 if (block_group_cache_done(cache)) {
3138 cache->space_info->bytes_reserved += len; 3592 cache->last_byte_to_unpin = (u64)-1;
3593 list_del_init(&caching_ctl->list);
3594 put_caching_control(caching_ctl);
3139 } else { 3595 } else {
3140 cache->reserved -= len; 3596 cache->last_byte_to_unpin = caching_ctl->progress;
3141 cache->space_info->bytes_reserved -= len;
3142 } 3597 }
3143 spin_unlock(&cache->lock);
3144 spin_unlock(&cache->space_info->lock);
3145 btrfs_put_block_group(cache);
3146 bytenr += len;
3147 num -= len;
3148 } 3598 }
3599
3600 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3601 fs_info->pinned_extents = &fs_info->freed_extents[1];
3602 else
3603 fs_info->pinned_extents = &fs_info->freed_extents[0];
3604
3605 up_write(&fs_info->extent_commit_sem);
3149 return 0; 3606 return 0;
3150} 3607}
3151 3608
3152int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) 3609static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3153{ 3610{
3154 u64 last = 0; 3611 struct btrfs_fs_info *fs_info = root->fs_info;
3155 u64 start; 3612 struct btrfs_block_group_cache *cache = NULL;
3156 u64 end; 3613 u64 len;
3157 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
3158 int ret;
3159 3614
3160 while (1) { 3615 while (start <= end) {
3161 ret = find_first_extent_bit(pinned_extents, last, 3616 if (!cache ||
3162 &start, &end, EXTENT_DIRTY); 3617 start >= cache->key.objectid + cache->key.offset) {
3163 if (ret) 3618 if (cache)
3164 break; 3619 btrfs_put_block_group(cache);
3620 cache = btrfs_lookup_block_group(fs_info, start);
3621 BUG_ON(!cache);
3622 }
3623
3624 len = cache->key.objectid + cache->key.offset - start;
3625 len = min(len, end + 1 - start);
3626
3627 if (start < cache->last_byte_to_unpin) {
3628 len = min(len, cache->last_byte_to_unpin - start);
3629 btrfs_add_free_space(cache, start, len);
3630 }
3165 3631
3166 set_extent_dirty(copy, start, end, GFP_NOFS); 3632 spin_lock(&cache->space_info->lock);
3167 last = end + 1; 3633 spin_lock(&cache->lock);
3634 cache->pinned -= len;
3635 cache->space_info->bytes_pinned -= len;
3636 spin_unlock(&cache->lock);
3637 spin_unlock(&cache->space_info->lock);
3638
3639 start += len;
3168 } 3640 }
3641
3642 if (cache)
3643 btrfs_put_block_group(cache);
3169 return 0; 3644 return 0;
3170} 3645}
3171 3646
3172int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 3647int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3173 struct btrfs_root *root, 3648 struct btrfs_root *root)
3174 struct extent_io_tree *unpin)
3175{ 3649{
3650 struct btrfs_fs_info *fs_info = root->fs_info;
3651 struct extent_io_tree *unpin;
3176 u64 start; 3652 u64 start;
3177 u64 end; 3653 u64 end;
3178 int ret; 3654 int ret;
3179 3655
3656 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3657 unpin = &fs_info->freed_extents[1];
3658 else
3659 unpin = &fs_info->freed_extents[0];
3660
3180 while (1) { 3661 while (1) {
3181 ret = find_first_extent_bit(unpin, 0, &start, &end, 3662 ret = find_first_extent_bit(unpin, 0, &start, &end,
3182 EXTENT_DIRTY); 3663 EXTENT_DIRTY);
@@ -3185,10 +3666,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3185 3666
3186 ret = btrfs_discard_extent(root, start, end + 1 - start); 3667 ret = btrfs_discard_extent(root, start, end + 1 - start);
3187 3668
3188 /* unlocks the pinned mutex */
3189 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
3190 clear_extent_dirty(unpin, start, end, GFP_NOFS); 3669 clear_extent_dirty(unpin, start, end, GFP_NOFS);
3191 3670 unpin_extent_range(root, start, end);
3192 cond_resched(); 3671 cond_resched();
3193 } 3672 }
3194 3673
@@ -3198,7 +3677,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3198static int pin_down_bytes(struct btrfs_trans_handle *trans, 3677static int pin_down_bytes(struct btrfs_trans_handle *trans,
3199 struct btrfs_root *root, 3678 struct btrfs_root *root,
3200 struct btrfs_path *path, 3679 struct btrfs_path *path,
3201 u64 bytenr, u64 num_bytes, int is_data, 3680 u64 bytenr, u64 num_bytes,
3681 int is_data, int reserved,
3202 struct extent_buffer **must_clean) 3682 struct extent_buffer **must_clean)
3203{ 3683{
3204 int err = 0; 3684 int err = 0;
@@ -3207,6 +3687,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
3207 if (is_data) 3687 if (is_data)
3208 goto pinit; 3688 goto pinit;
3209 3689
3690 /*
3691 * discard is sloooow, and so triggering discards on
3692 * individual btree blocks isn't a good plan. Just
3693 * pin everything in discard mode.
3694 */
3695 if (btrfs_test_opt(root, DISCARD))
3696 goto pinit;
3697
3210 buf = btrfs_find_tree_block(root, bytenr, num_bytes); 3698 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3211 if (!buf) 3699 if (!buf)
3212 goto pinit; 3700 goto pinit;
@@ -3230,15 +3718,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
3230 } 3718 }
3231 free_extent_buffer(buf); 3719 free_extent_buffer(buf);
3232pinit: 3720pinit:
3233 btrfs_set_path_blocking(path); 3721 if (path)
3722 btrfs_set_path_blocking(path);
3234 /* unlocks the pinned mutex */ 3723 /* unlocks the pinned mutex */
3235 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 3724 btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3236 3725
3237 BUG_ON(err < 0); 3726 BUG_ON(err < 0);
3238 return 0; 3727 return 0;
3239} 3728}
3240 3729
3241
3242static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 3730static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3243 struct btrfs_root *root, 3731 struct btrfs_root *root,
3244 u64 bytenr, u64 num_bytes, u64 parent, 3732 u64 bytenr, u64 num_bytes, u64 parent,
@@ -3412,7 +3900,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3412 } 3900 }
3413 3901
3414 ret = pin_down_bytes(trans, root, path, bytenr, 3902 ret = pin_down_bytes(trans, root, path, bytenr,
3415 num_bytes, is_data, &must_clean); 3903 num_bytes, is_data, 0, &must_clean);
3416 if (ret > 0) 3904 if (ret > 0)
3417 mark_free = 1; 3905 mark_free = 1;
3418 BUG_ON(ret < 0); 3906 BUG_ON(ret < 0);
@@ -3543,8 +4031,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
3543 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 4031 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
3544 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 4032 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
3545 /* unlocks the pinned mutex */ 4033 /* unlocks the pinned mutex */
3546 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); 4034 btrfs_pin_extent(root, bytenr, num_bytes, 1);
3547 update_reserved_extents(root, bytenr, num_bytes, 0);
3548 ret = 0; 4035 ret = 0;
3549 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 4036 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
3550 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 4037 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
@@ -3584,24 +4071,38 @@ static noinline int
3584wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 4071wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
3585 u64 num_bytes) 4072 u64 num_bytes)
3586{ 4073{
4074 struct btrfs_caching_control *caching_ctl;
3587 DEFINE_WAIT(wait); 4075 DEFINE_WAIT(wait);
3588 4076
3589 prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); 4077 caching_ctl = get_caching_control(cache);
3590 4078 if (!caching_ctl)
3591 if (block_group_cache_done(cache)) {
3592 finish_wait(&cache->caching_q, &wait);
3593 return 0; 4079 return 0;
3594 }
3595 schedule();
3596 finish_wait(&cache->caching_q, &wait);
3597 4080
3598 wait_event(cache->caching_q, block_group_cache_done(cache) || 4081 wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
3599 (cache->free_space >= num_bytes)); 4082 (cache->free_space >= num_bytes));
4083
4084 put_caching_control(caching_ctl);
4085 return 0;
4086}
4087
4088static noinline int
4089wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4090{
4091 struct btrfs_caching_control *caching_ctl;
4092 DEFINE_WAIT(wait);
4093
4094 caching_ctl = get_caching_control(cache);
4095 if (!caching_ctl)
4096 return 0;
4097
4098 wait_event(caching_ctl->wait, block_group_cache_done(cache));
4099
4100 put_caching_control(caching_ctl);
3600 return 0; 4101 return 0;
3601} 4102}
3602 4103
3603enum btrfs_loop_type { 4104enum btrfs_loop_type {
3604 LOOP_CACHED_ONLY = 0, 4105 LOOP_FIND_IDEAL = 0,
3605 LOOP_CACHING_NOWAIT = 1, 4106 LOOP_CACHING_NOWAIT = 1,
3606 LOOP_CACHING_WAIT = 2, 4107 LOOP_CACHING_WAIT = 2,
3607 LOOP_ALLOC_CHUNK = 3, 4108 LOOP_ALLOC_CHUNK = 3,
@@ -3630,10 +4131,15 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3630 struct btrfs_block_group_cache *block_group = NULL; 4131 struct btrfs_block_group_cache *block_group = NULL;
3631 int empty_cluster = 2 * 1024 * 1024; 4132 int empty_cluster = 2 * 1024 * 1024;
3632 int allowed_chunk_alloc = 0; 4133 int allowed_chunk_alloc = 0;
4134 int done_chunk_alloc = 0;
3633 struct btrfs_space_info *space_info; 4135 struct btrfs_space_info *space_info;
3634 int last_ptr_loop = 0; 4136 int last_ptr_loop = 0;
3635 int loop = 0; 4137 int loop = 0;
3636 bool found_uncached_bg = false; 4138 bool found_uncached_bg = false;
4139 bool failed_cluster_refill = false;
4140 bool failed_alloc = false;
4141 u64 ideal_cache_percent = 0;
4142 u64 ideal_cache_offset = 0;
3637 4143
3638 WARN_ON(num_bytes < root->sectorsize); 4144 WARN_ON(num_bytes < root->sectorsize);
3639 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 4145 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3669,14 +4175,19 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3669 empty_cluster = 0; 4175 empty_cluster = 0;
3670 4176
3671 if (search_start == hint_byte) { 4177 if (search_start == hint_byte) {
4178ideal_cache:
3672 block_group = btrfs_lookup_block_group(root->fs_info, 4179 block_group = btrfs_lookup_block_group(root->fs_info,
3673 search_start); 4180 search_start);
3674 /* 4181 /*
3675 * we don't want to use the block group if it doesn't match our 4182 * we don't want to use the block group if it doesn't match our
3676 * allocation bits, or if its not cached. 4183 * allocation bits, or if its not cached.
4184 *
4185 * However if we are re-searching with an ideal block group
4186 * picked out then we don't care that the block group is cached.
3677 */ 4187 */
3678 if (block_group && block_group_bits(block_group, data) && 4188 if (block_group && block_group_bits(block_group, data) &&
3679 block_group_cache_done(block_group)) { 4189 (block_group->cached != BTRFS_CACHE_NO ||
4190 search_start == ideal_cache_offset)) {
3680 down_read(&space_info->groups_sem); 4191 down_read(&space_info->groups_sem);
3681 if (list_empty(&block_group->list) || 4192 if (list_empty(&block_group->list) ||
3682 block_group->ro) { 4193 block_group->ro) {
@@ -3688,13 +4199,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3688 */ 4199 */
3689 btrfs_put_block_group(block_group); 4200 btrfs_put_block_group(block_group);
3690 up_read(&space_info->groups_sem); 4201 up_read(&space_info->groups_sem);
3691 } else 4202 } else {
3692 goto have_block_group; 4203 goto have_block_group;
4204 }
3693 } else if (block_group) { 4205 } else if (block_group) {
3694 btrfs_put_block_group(block_group); 4206 btrfs_put_block_group(block_group);
3695 } 4207 }
3696 } 4208 }
3697
3698search: 4209search:
3699 down_read(&space_info->groups_sem); 4210 down_read(&space_info->groups_sem);
3700 list_for_each_entry(block_group, &space_info->block_groups, list) { 4211 list_for_each_entry(block_group, &space_info->block_groups, list) {
@@ -3706,32 +4217,58 @@ search:
3706 4217
3707have_block_group: 4218have_block_group:
3708 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4219 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4220 u64 free_percent;
4221
4222 free_percent = btrfs_block_group_used(&block_group->item);
4223 free_percent *= 100;
4224 free_percent = div64_u64(free_percent,
4225 block_group->key.offset);
4226 free_percent = 100 - free_percent;
4227 if (free_percent > ideal_cache_percent &&
4228 likely(!block_group->ro)) {
4229 ideal_cache_offset = block_group->key.objectid;
4230 ideal_cache_percent = free_percent;
4231 }
4232
3709 /* 4233 /*
3710 * we want to start caching kthreads, but not too many 4234 * We only want to start kthread caching if we are at
3711 * right off the bat so we don't overwhelm the system, 4235 * the point where we will wait for caching to make
3712 * so only start them if there are less than 2 and we're 4236 * progress, or if our ideal search is over and we've
3713 * in the initial allocation phase. 4237 * found somebody to start caching.
3714 */ 4238 */
3715 if (loop > LOOP_CACHING_NOWAIT || 4239 if (loop > LOOP_CACHING_NOWAIT ||
3716 atomic_read(&space_info->caching_threads) < 2) { 4240 (loop > LOOP_FIND_IDEAL &&
4241 atomic_read(&space_info->caching_threads) < 2)) {
3717 ret = cache_block_group(block_group); 4242 ret = cache_block_group(block_group);
3718 BUG_ON(ret); 4243 BUG_ON(ret);
3719 } 4244 }
3720 }
3721
3722 cached = block_group_cache_done(block_group);
3723 if (unlikely(!cached)) {
3724 found_uncached_bg = true; 4245 found_uncached_bg = true;
3725 4246
3726 /* if we only want cached bgs, loop */ 4247 /*
3727 if (loop == LOOP_CACHED_ONLY) 4248 * If loop is set for cached only, try the next block
4249 * group.
4250 */
4251 if (loop == LOOP_FIND_IDEAL)
3728 goto loop; 4252 goto loop;
3729 } 4253 }
3730 4254
4255 cached = block_group_cache_done(block_group);
4256 if (unlikely(!cached))
4257 found_uncached_bg = true;
4258
3731 if (unlikely(block_group->ro)) 4259 if (unlikely(block_group->ro))
3732 goto loop; 4260 goto loop;
3733 4261
3734 if (last_ptr) { 4262 /*
4263 * Ok we want to try and use the cluster allocator, so lets look
4264 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
4265 * have tried the cluster allocator plenty of times at this
4266 * point and not have found anything, so we are likely way too
4267 * fragmented for the clustering stuff to find anything, so lets
4268 * just skip it and let the allocator find whatever block it can
4269 * find
4270 */
4271 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
3735 /* 4272 /*
3736 * the refill lock keeps out other 4273 * the refill lock keeps out other
3737 * people trying to start a new cluster 4274 * people trying to start a new cluster
@@ -3806,9 +4343,11 @@ refill_cluster:
3806 spin_unlock(&last_ptr->refill_lock); 4343 spin_unlock(&last_ptr->refill_lock);
3807 goto checks; 4344 goto checks;
3808 } 4345 }
3809 } else if (!cached && loop > LOOP_CACHING_NOWAIT) { 4346 } else if (!cached && loop > LOOP_CACHING_NOWAIT
4347 && !failed_cluster_refill) {
3810 spin_unlock(&last_ptr->refill_lock); 4348 spin_unlock(&last_ptr->refill_lock);
3811 4349
4350 failed_cluster_refill = true;
3812 wait_block_group_cache_progress(block_group, 4351 wait_block_group_cache_progress(block_group,
3813 num_bytes + empty_cluster + empty_size); 4352 num_bytes + empty_cluster + empty_size);
3814 goto have_block_group; 4353 goto have_block_group;
@@ -3820,25 +4359,30 @@ refill_cluster:
3820 * cluster. Free the cluster we've been trying 4359 * cluster. Free the cluster we've been trying
3821 * to use, and go to the next block group 4360 * to use, and go to the next block group
3822 */ 4361 */
3823 if (loop < LOOP_NO_EMPTY_SIZE) { 4362 btrfs_return_cluster_to_free_space(NULL, last_ptr);
3824 btrfs_return_cluster_to_free_space(NULL,
3825 last_ptr);
3826 spin_unlock(&last_ptr->refill_lock);
3827 goto loop;
3828 }
3829 spin_unlock(&last_ptr->refill_lock); 4363 spin_unlock(&last_ptr->refill_lock);
4364 goto loop;
3830 } 4365 }
3831 4366
3832 offset = btrfs_find_space_for_alloc(block_group, search_start, 4367 offset = btrfs_find_space_for_alloc(block_group, search_start,
3833 num_bytes, empty_size); 4368 num_bytes, empty_size);
3834 if (!offset && (cached || (!cached && 4369 /*
3835 loop == LOOP_CACHING_NOWAIT))) { 4370 * If we didn't find a chunk, and we haven't failed on this
3836 goto loop; 4371 * block group before, and this block group is in the middle of
3837 } else if (!offset && (!cached && 4372 * caching and we are ok with waiting, then go ahead and wait
3838 loop > LOOP_CACHING_NOWAIT)) { 4373 * for progress to be made, and set failed_alloc to true.
4374 *
4375 * If failed_alloc is true then we've already waited on this
4376 * block group once and should move on to the next block group.
4377 */
4378 if (!offset && !failed_alloc && !cached &&
4379 loop > LOOP_CACHING_NOWAIT) {
3839 wait_block_group_cache_progress(block_group, 4380 wait_block_group_cache_progress(block_group,
3840 num_bytes + empty_size); 4381 num_bytes + empty_size);
4382 failed_alloc = true;
3841 goto have_block_group; 4383 goto have_block_group;
4384 } else if (!offset) {
4385 goto loop;
3842 } 4386 }
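
The wait-or-skip decision that this hunk wraps around btrfs_find_space_for_alloc() boils down to a small predicate. The sketch below is an illustrative reduction, not the patch's code; next_step() and the LOOP_CACHING_NOWAIT value are invented for the example:

#include <stdbool.h>
#include <stdio.h>

enum alloc_next_step {
	USE_OFFSET,        /* free space was found, go on to the "checks:" label */
	WAIT_AND_RETRY,    /* wait for caching progress, then retry this group   */
	NEXT_BLOCK_GROUP,  /* give up on this group, fall through to "loop:"     */
};

#define LOOP_CACHING_NOWAIT 1   /* placeholder value for the example */

static enum alloc_next_step next_step(unsigned long long offset, bool cached,
				      bool *failed_alloc, int loop)
{
	if (offset)
		return USE_OFFSET;

	/* Only wait once per block group: the first failure waits for the
	 * caching kthread to make progress, the second moves on. */
	if (!*failed_alloc && !cached && loop > LOOP_CACHING_NOWAIT) {
		*failed_alloc = true;
		return WAIT_AND_RETRY;
	}
	return NEXT_BLOCK_GROUP;
}

int main(void)
{
	bool failed_alloc = false;

	/* first miss on an uncached group while we're allowed to wait */
	printf("%d\n", next_step(0, false, &failed_alloc, LOOP_CACHING_NOWAIT + 1));
	/* second miss on the same group: move on */
	printf("%d\n", next_step(0, false, &failed_alloc, LOOP_CACHING_NOWAIT + 1));
	return 0;
}
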
3843checks: 4387checks:
3844 search_start = stripe_align(root, offset); 4388 search_start = stripe_align(root, offset);
@@ -3880,16 +4424,22 @@ checks:
3880 search_start - offset); 4424 search_start - offset);
3881 BUG_ON(offset > search_start); 4425 BUG_ON(offset > search_start);
3882 4426
4427 update_reserved_extents(block_group, num_bytes, 1);
4428
3883 /* we are all good, lets return */ 4429 /* we are all good, lets return */
3884 break; 4430 break;
3885loop: 4431loop:
4432 failed_cluster_refill = false;
4433 failed_alloc = false;
3886 btrfs_put_block_group(block_group); 4434 btrfs_put_block_group(block_group);
3887 } 4435 }
3888 up_read(&space_info->groups_sem); 4436 up_read(&space_info->groups_sem);
3889 4437
3890 /* LOOP_CACHED_ONLY, only search fully cached block groups 4438 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
3891 * LOOP_CACHING_NOWAIT, search partially cached block groups, but 4439 * for them to make caching progress. Also
3892 * don't wait for them to finish caching 4440 * determine the best possible bg to cache
4441 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
4442 * caching kthreads as we move along
3893 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 4443 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3894 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 4444 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3895 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 4445 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
@@ -3898,12 +4448,47 @@ loop:
3898 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4448 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
3899 (found_uncached_bg || empty_size || empty_cluster || 4449 (found_uncached_bg || empty_size || empty_cluster ||
3900 allowed_chunk_alloc)) { 4450 allowed_chunk_alloc)) {
3901 if (found_uncached_bg) { 4451 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
3902 found_uncached_bg = false; 4452 found_uncached_bg = false;
3903 if (loop < LOOP_CACHING_WAIT) { 4453 loop++;
3904 loop++; 4454 if (!ideal_cache_percent &&
4455 atomic_read(&space_info->caching_threads))
3905 goto search; 4456 goto search;
3906 } 4457
4458 /*
4459 * 1 of the following 2 things has happened so far
4460 *
4461 * 1) We found an ideal block group for caching that
4462 * is mostly full and will cache quickly, so we might
4463 * as well wait for it.
4464 *
4465 * 2) We searched for cached only and we didn't find
4466 * anything, and we didn't start any caching kthreads
4467 * either, so chances are we will loop through and
4468 * start a couple caching kthreads, and then come back
4469 * around and just wait for them. This will be slower
4470 * because we will have 2 caching kthreads reading at
4471 * the same time when we could have just started one
4472 * and waited for it to get far enough to give us an
4473 * allocation, so go ahead and go to the wait caching
4474 * loop.
4475 */
4476 loop = LOOP_CACHING_WAIT;
4477 search_start = ideal_cache_offset;
4478 ideal_cache_percent = 0;
4479 goto ideal_cache;
4480 } else if (loop == LOOP_FIND_IDEAL) {
4481 /*
4482 * Didn't find an uncached bg, wait on anything we find
4483 * next.
4484 */
4485 loop = LOOP_CACHING_WAIT;
4486 goto search;
4487 }
4488
4489 if (loop < LOOP_CACHING_WAIT) {
4490 loop++;
4491 goto search;
3907 } 4492 }
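
Taken together, the branches above implement a simple escalation over the loop stages described in the comment block earlier in this hunk. A condensed state-machine sketch, with the stage values assumed for illustration and the chunk-allocation stages left out:

#include <stdbool.h>
#include <stdio.h>

enum loop_stage {               /* assumed values, for illustration only */
	LOOP_FIND_IDEAL = 0,    /* scan caching/cached groups, pick an ideal one   */
	LOOP_CACHING_NOWAIT,    /* kick caching kthreads but don't wait on them    */
	LOOP_CACHING_WAIT,      /* wait for caching progress on the current group  */
	LOOP_ALLOC_CHUNK,       /* force a chunk allocation and retry              */
	LOOP_NO_EMPTY_SIZE      /* retry with empty_size/empty_cluster forced to 0 */
};

/*
 * Rough equivalent of the "found nothing, escalate" branch above: returns the
 * stage to retry with, or -1 once the chunk-allocation stages (not modelled
 * here) take over.
 */
static int escalate(int loop, bool found_uncached_bg,
		    bool have_ideal_candidate, int caching_threads)
{
	if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
		/* no candidate yet but kthreads are already reading: one more
		 * pass without waiting */
		if (!have_ideal_candidate && caching_threads > 0)
			return LOOP_CACHING_NOWAIT;
		/* otherwise jump straight to waiting on the ideal candidate */
		return LOOP_CACHING_WAIT;
	}
	if (loop == LOOP_FIND_IDEAL)
		return LOOP_CACHING_WAIT;  /* nothing uncached left, just wait */
	if (loop < LOOP_CACHING_WAIT)
		return loop + 1;
	return -1;
}

int main(void)
{
	printf("%d\n", escalate(LOOP_FIND_IDEAL, true, true, 0));      /* -> LOOP_CACHING_WAIT   */
	printf("%d\n", escalate(LOOP_CACHING_NOWAIT, false, false, 0)); /* -> LOOP_CACHING_WAIT  */
	return 0;
}
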
3908 4493
3909 if (loop == LOOP_ALLOC_CHUNK) { 4494 if (loop == LOOP_ALLOC_CHUNK) {
@@ -3915,7 +4500,8 @@ loop:
3915 ret = do_chunk_alloc(trans, root, num_bytes + 4500 ret = do_chunk_alloc(trans, root, num_bytes +
3916 2 * 1024 * 1024, data, 1); 4501 2 * 1024 * 1024, data, 1);
3917 allowed_chunk_alloc = 0; 4502 allowed_chunk_alloc = 0;
3918 } else { 4503 done_chunk_alloc = 1;
4504 } else if (!done_chunk_alloc) {
3919 space_info->force_alloc = 1; 4505 space_info->force_alloc = 1;
3920 } 4506 }
3921 4507
@@ -3940,21 +4526,32 @@ loop:
3940 return ret; 4526 return ret;
3941} 4527}
3942 4528
3943static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 4529static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4530 int dump_block_groups)
3944{ 4531{
3945 struct btrfs_block_group_cache *cache; 4532 struct btrfs_block_group_cache *cache;
3946 4533
4534 spin_lock(&info->lock);
3947 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4535 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3948 (unsigned long long)(info->total_bytes - info->bytes_used - 4536 (unsigned long long)(info->total_bytes - info->bytes_used -
3949 info->bytes_pinned - info->bytes_reserved), 4537 info->bytes_pinned - info->bytes_reserved -
4538 info->bytes_super),
3950 (info->full) ? "" : "not "); 4539 (info->full) ? "" : "not ");
3951 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4540 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
3952 " may_use=%llu, used=%llu\n", 4541 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
4542 "\n",
3953 (unsigned long long)info->total_bytes, 4543 (unsigned long long)info->total_bytes,
3954 (unsigned long long)info->bytes_pinned, 4544 (unsigned long long)info->bytes_pinned,
3955 (unsigned long long)info->bytes_delalloc, 4545 (unsigned long long)info->bytes_delalloc,
3956 (unsigned long long)info->bytes_may_use, 4546 (unsigned long long)info->bytes_may_use,
3957 (unsigned long long)info->bytes_used); 4547 (unsigned long long)info->bytes_used,
4548 (unsigned long long)info->bytes_root,
4549 (unsigned long long)info->bytes_super,
4550 (unsigned long long)info->bytes_reserved);
4551 spin_unlock(&info->lock);
4552
4553 if (!dump_block_groups)
4554 return;
3958 4555
3959 down_read(&info->groups_sem); 4556 down_read(&info->groups_sem);
3960 list_for_each_entry(cache, &info->block_groups, list) { 4557 list_for_each_entry(cache, &info->block_groups, list) {
@@ -3972,12 +4569,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3972 up_read(&info->groups_sem); 4569 up_read(&info->groups_sem);
3973} 4570}
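
The "free" number printed by dump_space_info() is derived rather than stored; with bytes_super now in the mix it is the total minus used, pinned, reserved and super bytes. A tiny standalone rendition (the struct here is a stand-in, not struct btrfs_space_info):

#include <stdint.h>
#include <stdio.h>

struct space_info_view {          /* stand-in for struct btrfs_space_info */
	uint64_t total_bytes;
	uint64_t bytes_used;
	uint64_t bytes_pinned;
	uint64_t bytes_reserved;
	uint64_t bytes_super;     /* newly accounted super mirror bytes */
};

static uint64_t space_info_free(const struct space_info_view *s)
{
	/* same subtraction the printk above performs */
	return s->total_bytes - s->bytes_used - s->bytes_pinned -
	       s->bytes_reserved - s->bytes_super;
}

int main(void)
{
	struct space_info_view s = {
		.total_bytes = 8ULL << 30, .bytes_used = 5ULL << 30,
		.bytes_pinned = 256ULL << 20, .bytes_reserved = 128ULL << 20,
		.bytes_super = 64ULL << 20,
	};
	printf("space_info has %llu free\n",
	       (unsigned long long)space_info_free(&s));
	return 0;
}
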
3974 4571
3975static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, 4572int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3976 struct btrfs_root *root, 4573 struct btrfs_root *root,
3977 u64 num_bytes, u64 min_alloc_size, 4574 u64 num_bytes, u64 min_alloc_size,
3978 u64 empty_size, u64 hint_byte, 4575 u64 empty_size, u64 hint_byte,
3979 u64 search_end, struct btrfs_key *ins, 4576 u64 search_end, struct btrfs_key *ins,
3980 u64 data) 4577 u64 data)
3981{ 4578{
3982 int ret; 4579 int ret;
3983 u64 search_start = 0; 4580 u64 search_start = 0;
@@ -4022,7 +4619,7 @@ again:
4022 printk(KERN_ERR "btrfs allocation failed flags %llu, " 4619 printk(KERN_ERR "btrfs allocation failed flags %llu, "
4023 "wanted %llu\n", (unsigned long long)data, 4620 "wanted %llu\n", (unsigned long long)data,
4024 (unsigned long long)num_bytes); 4621 (unsigned long long)num_bytes);
4025 dump_space_info(sinfo, num_bytes); 4622 dump_space_info(sinfo, num_bytes, 1);
4026 } 4623 }
4027 4624
4028 return ret; 4625 return ret;
@@ -4043,25 +4640,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4043 ret = btrfs_discard_extent(root, start, len); 4640 ret = btrfs_discard_extent(root, start, len);
4044 4641
4045 btrfs_add_free_space(cache, start, len); 4642 btrfs_add_free_space(cache, start, len);
4643 update_reserved_extents(cache, len, 0);
4046 btrfs_put_block_group(cache); 4644 btrfs_put_block_group(cache);
4047 update_reserved_extents(root, start, len, 0);
4048
4049 return ret;
4050}
4051
4052int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
4053 struct btrfs_root *root,
4054 u64 num_bytes, u64 min_alloc_size,
4055 u64 empty_size, u64 hint_byte,
4056 u64 search_end, struct btrfs_key *ins,
4057 u64 data)
4058{
4059 int ret;
4060 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
4061 empty_size, hint_byte, search_end, ins,
4062 data);
4063 if (!ret)
4064 update_reserved_extents(root, ins->objectid, ins->offset, 1);
4065 4645
4066 return ret; 4646 return ret;
4067} 4647}
@@ -4222,15 +4802,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4222{ 4802{
4223 int ret; 4803 int ret;
4224 struct btrfs_block_group_cache *block_group; 4804 struct btrfs_block_group_cache *block_group;
4805 struct btrfs_caching_control *caching_ctl;
4806 u64 start = ins->objectid;
4807 u64 num_bytes = ins->offset;
4225 4808
4226 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 4809 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
4227 cache_block_group(block_group); 4810 cache_block_group(block_group);
4228 wait_event(block_group->caching_q, 4811 caching_ctl = get_caching_control(block_group);
4229 block_group_cache_done(block_group));
4230 4812
4231 ret = btrfs_remove_free_space(block_group, ins->objectid, 4813 if (!caching_ctl) {
4232 ins->offset); 4814 BUG_ON(!block_group_cache_done(block_group));
4233 BUG_ON(ret); 4815 ret = btrfs_remove_free_space(block_group, start, num_bytes);
4816 BUG_ON(ret);
4817 } else {
4818 mutex_lock(&caching_ctl->mutex);
4819
4820 if (start >= caching_ctl->progress) {
4821 ret = add_excluded_extent(root, start, num_bytes);
4822 BUG_ON(ret);
4823 } else if (start + num_bytes <= caching_ctl->progress) {
4824 ret = btrfs_remove_free_space(block_group,
4825 start, num_bytes);
4826 BUG_ON(ret);
4827 } else {
4828 num_bytes = caching_ctl->progress - start;
4829 ret = btrfs_remove_free_space(block_group,
4830 start, num_bytes);
4831 BUG_ON(ret);
4832
4833 start = caching_ctl->progress;
4834 num_bytes = ins->objectid + ins->offset -
4835 caching_ctl->progress;
4836 ret = add_excluded_extent(root, start, num_bytes);
4837 BUG_ON(ret);
4838 }
4839
4840 mutex_unlock(&caching_ctl->mutex);
4841 put_caching_control(caching_ctl);
4842 }
4843
4844 update_reserved_extents(block_group, ins->offset, 1);
4234 btrfs_put_block_group(block_group); 4845 btrfs_put_block_group(block_group);
4235 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 4846 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4236 0, owner, offset, ins, 1); 4847 0, owner, offset, ins, 1);
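
The interesting part of the hunk above is the three-way split of the logged extent against caching_ctl->progress: whatever the caching thread has already scanned must be removed from the free-space cache, and whatever it has not reached yet must be excluded so the scanner skips it. A hypothetical helper that only classifies the range (no locking, no btrfs calls) makes the arithmetic explicit:

#include <stdint.h>
#include <stdio.h>

/* Portion of [start, start+num) already added to the free space cache
 * (must be removed) versus the portion not scanned yet (must be excluded). */
struct range_split {
	uint64_t remove_start, remove_bytes;    /* already-cached portion */
	uint64_t exclude_start, exclude_bytes;  /* not-yet-cached portion */
};

static struct range_split split_against_progress(uint64_t start,
						 uint64_t num_bytes,
						 uint64_t progress)
{
	struct range_split rs = {0};

	if (start >= progress) {
		/* caching hasn't reached us yet: exclude the whole range */
		rs.exclude_start = start;
		rs.exclude_bytes = num_bytes;
	} else if (start + num_bytes <= progress) {
		/* caching already passed us: remove the whole range */
		rs.remove_start = start;
		rs.remove_bytes = num_bytes;
	} else {
		/* straddles the progress point: split it */
		rs.remove_start = start;
		rs.remove_bytes = progress - start;
		rs.exclude_start = progress;
		rs.exclude_bytes = start + num_bytes - progress;
	}
	return rs;
}

int main(void)
{
	struct range_split rs = split_against_progress(100, 50, 120);
	printf("remove %llu@%llu, exclude %llu@%llu\n",
	       (unsigned long long)rs.remove_bytes, (unsigned long long)rs.remove_start,
	       (unsigned long long)rs.exclude_bytes, (unsigned long long)rs.exclude_start);
	return 0;
}
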
@@ -4254,9 +4865,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
4254 int ret; 4865 int ret;
4255 u64 flags = 0; 4866 u64 flags = 0;
4256 4867
4257 ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, 4868 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4258 empty_size, hint_byte, search_end, 4869 empty_size, hint_byte, search_end,
4259 ins, 0); 4870 ins, 0);
4260 if (ret) 4871 if (ret)
4261 return ret; 4872 return ret;
4262 4873
@@ -4267,7 +4878,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
4267 } else 4878 } else
4268 BUG_ON(parent > 0); 4879 BUG_ON(parent > 0);
4269 4880
4270 update_reserved_extents(root, ins->objectid, ins->offset, 1);
4271 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 4881 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4272 struct btrfs_delayed_extent_op *extent_op; 4882 struct btrfs_delayed_extent_op *extent_op;
4273 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 4883 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
@@ -4346,452 +4956,108 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4346 return buf; 4956 return buf;
4347} 4957}
4348 4958
4349#if 0 4959struct walk_control {
4350int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 4960 u64 refs[BTRFS_MAX_LEVEL];
4351 struct btrfs_root *root, struct extent_buffer *leaf) 4961 u64 flags[BTRFS_MAX_LEVEL];
4352{ 4962 struct btrfs_key update_progress;
4353 u64 disk_bytenr; 4963 int stage;
4354 u64 num_bytes; 4964 int level;
4355 struct btrfs_key key; 4965 int shared_level;
4356 struct btrfs_file_extent_item *fi; 4966 int update_ref;
4357 u32 nritems; 4967 int keep_locks;
4358 int i; 4968 int reada_slot;
4359 int ret; 4969 int reada_count;
4360 4970};
4361 BUG_ON(!btrfs_is_leaf(leaf));
4362 nritems = btrfs_header_nritems(leaf);
4363
4364 for (i = 0; i < nritems; i++) {
4365 cond_resched();
4366 btrfs_item_key_to_cpu(leaf, &key, i);
4367
4368 /* only extents have references, skip everything else */
4369 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4370 continue;
4371
4372 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4373
4374 /* inline extents live in the btree, they don't have refs */
4375 if (btrfs_file_extent_type(leaf, fi) ==
4376 BTRFS_FILE_EXTENT_INLINE)
4377 continue;
4378
4379 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4380
4381 /* holes don't have refs */
4382 if (disk_bytenr == 0)
4383 continue;
4384
4385 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4386 ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
4387 leaf->start, 0, key.objectid, 0);
4388 BUG_ON(ret);
4389 }
4390 return 0;
4391}
4392
4393static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
4394 struct btrfs_root *root,
4395 struct btrfs_leaf_ref *ref)
4396{
4397 int i;
4398 int ret;
4399 struct btrfs_extent_info *info;
4400 struct refsort *sorted;
4401
4402 if (ref->nritems == 0)
4403 return 0;
4404
4405 sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
4406 for (i = 0; i < ref->nritems; i++) {
4407 sorted[i].bytenr = ref->extents[i].bytenr;
4408 sorted[i].slot = i;
4409 }
4410 sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
4411
4412 /*
4413 * the items in the ref were sorted when the ref was inserted
4414 * into the ref cache, so this is already in order
4415 */
4416 for (i = 0; i < ref->nritems; i++) {
4417 info = ref->extents + sorted[i].slot;
4418 ret = btrfs_free_extent(trans, root, info->bytenr,
4419 info->num_bytes, ref->bytenr,
4420 ref->owner, ref->generation,
4421 info->objectid, 0);
4422
4423 atomic_inc(&root->fs_info->throttle_gen);
4424 wake_up(&root->fs_info->transaction_throttle);
4425 cond_resched();
4426
4427 BUG_ON(ret);
4428 info++;
4429 }
4430
4431 kfree(sorted);
4432 return 0;
4433}
4434
4435
4436static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
4437 struct btrfs_root *root, u64 start,
4438 u64 len, u32 *refs)
4439{
4440 int ret;
4441
4442 ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
4443 BUG_ON(ret);
4444
4445#if 0 /* some debugging code in case we see problems here */
4446 /* if the refs count is one, it won't get increased again. But
4447 * if the ref count is > 1, someone may be decreasing it at
4448 * the same time we are.
4449 */
4450 if (*refs != 1) {
4451 struct extent_buffer *eb = NULL;
4452 eb = btrfs_find_create_tree_block(root, start, len);
4453 if (eb)
4454 btrfs_tree_lock(eb);
4455
4456 mutex_lock(&root->fs_info->alloc_mutex);
4457 ret = lookup_extent_ref(NULL, root, start, len, refs);
4458 BUG_ON(ret);
4459 mutex_unlock(&root->fs_info->alloc_mutex);
4460
4461 if (eb) {
4462 btrfs_tree_unlock(eb);
4463 free_extent_buffer(eb);
4464 }
4465 if (*refs == 1) {
4466 printk(KERN_ERR "btrfs block %llu went down to one "
4467 "during drop_snap\n", (unsigned long long)start);
4468 }
4469
4470 }
4471#endif
4472
4473 cond_resched();
4474 return ret;
4475}
4476 4971
4972#define DROP_REFERENCE 1
4973#define UPDATE_BACKREF 2
4477 4974
4478/* 4975static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4479 * this is used while deleting old snapshots, and it drops the refs 4976 struct btrfs_root *root,
4480 * on a whole subtree starting from a level 1 node. 4977 struct walk_control *wc,
4481 * 4978 struct btrfs_path *path)
4482 * The idea is to sort all the leaf pointers, and then drop the
4483 * ref on all the leaves in order. Most of the time the leaves
4484 * will have ref cache entries, so no leaf IOs will be required to
4485 * find the extents they have references on.
4486 *
4487 * For each leaf, any references it has are also dropped in order
4488 *
4489 * This ends up dropping the references in something close to optimal
4490 * order for reading and modifying the extent allocation tree.
4491 */
4492static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
4493 struct btrfs_root *root,
4494 struct btrfs_path *path)
4495{ 4979{
4496 u64 bytenr; 4980 u64 bytenr;
4497 u64 root_owner; 4981 u64 generation;
4498 u64 root_gen; 4982 u64 refs;
4499 struct extent_buffer *eb = path->nodes[1]; 4983 u64 flags;
4500 struct extent_buffer *leaf; 4984 u64 last = 0;
4501 struct btrfs_leaf_ref *ref; 4985 u32 nritems;
4502 struct refsort *sorted = NULL; 4986 u32 blocksize;
4503 int nritems = btrfs_header_nritems(eb); 4987 struct btrfs_key key;
4988 struct extent_buffer *eb;
4504 int ret; 4989 int ret;
4505 int i; 4990 int slot;
4506 int refi = 0; 4991 int nread = 0;
4507 int slot = path->slots[1];
4508 u32 blocksize = btrfs_level_size(root, 0);
4509 u32 refs;
4510
4511 if (nritems == 0)
4512 goto out;
4513
4514 root_owner = btrfs_header_owner(eb);
4515 root_gen = btrfs_header_generation(eb);
4516 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
4517 4992
4518 /* 4993 if (path->slots[wc->level] < wc->reada_slot) {
4519 * step one, sort all the leaf pointers so we don't scribble 4994 wc->reada_count = wc->reada_count * 2 / 3;
4520 * randomly into the extent allocation tree 4995 wc->reada_count = max(wc->reada_count, 2);
4521 */ 4996 } else {
4522 for (i = slot; i < nritems; i++) { 4997 wc->reada_count = wc->reada_count * 3 / 2;
4523 sorted[refi].bytenr = btrfs_node_blockptr(eb, i); 4998 wc->reada_count = min_t(int, wc->reada_count,
4524 sorted[refi].slot = i; 4999 BTRFS_NODEPTRS_PER_BLOCK(root));
4525 refi++;
4526 } 5000 }
4527 5001
4528 /* 5002 eb = path->nodes[wc->level];
4529 * nritems won't be zero, but if we're picking up drop_snapshot 5003 nritems = btrfs_header_nritems(eb);
4530 * after a crash, slot might be > 0, so double check things 5004 blocksize = btrfs_level_size(root, wc->level - 1);
4531 * just in case.
4532 */
4533 if (refi == 0)
4534 goto out;
4535 5005
4536 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); 5006 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5007 if (nread >= wc->reada_count)
5008 break;
4537 5009
4538 /* 5010 cond_resched();
4539 * the first loop frees everything the leaves point to 5011 bytenr = btrfs_node_blockptr(eb, slot);
4540 */ 5012 generation = btrfs_node_ptr_generation(eb, slot);
4541 for (i = 0; i < refi; i++) {
4542 u64 ptr_gen;
4543 5013
4544 bytenr = sorted[i].bytenr; 5014 if (slot == path->slots[wc->level])
5015 goto reada;
4545 5016
4546 /* 5017 if (wc->stage == UPDATE_BACKREF &&
4547 * check the reference count on this leaf. If it is > 1 5018 generation <= root->root_key.offset)
4548 * we just decrement it below and don't update any
4549 * of the refs the leaf points to.
4550 */
4551 ret = drop_snap_lookup_refcount(trans, root, bytenr,
4552 blocksize, &refs);
4553 BUG_ON(ret);
4554 if (refs != 1)
4555 continue; 5019 continue;
4556 5020
4557 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot); 5021 /* We don't lock the tree block, it's OK to be racy here */
4558 5022 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
4559 /* 5023 &refs, &flags);
4560 * the leaf only had one reference, which means the
4561 * only thing pointing to this leaf is the snapshot
4562 * we're deleting. It isn't possible for the reference
4563 * count to increase again later
4564 *
4565 * The reference cache is checked for the leaf,
4566 * and if found we'll be able to drop any refs held by
4567 * the leaf without needing to read it in.
4568 */
4569 ref = btrfs_lookup_leaf_ref(root, bytenr);
4570 if (ref && ref->generation != ptr_gen) {
4571 btrfs_free_leaf_ref(root, ref);
4572 ref = NULL;
4573 }
4574 if (ref) {
4575 ret = cache_drop_leaf_ref(trans, root, ref);
4576 BUG_ON(ret);
4577 btrfs_remove_leaf_ref(root, ref);
4578 btrfs_free_leaf_ref(root, ref);
4579 } else {
4580 /*
4581 * the leaf wasn't in the reference cache, so
4582 * we have to read it.
4583 */
4584 leaf = read_tree_block(root, bytenr, blocksize,
4585 ptr_gen);
4586 ret = btrfs_drop_leaf_ref(trans, root, leaf);
4587 BUG_ON(ret);
4588 free_extent_buffer(leaf);
4589 }
4590 atomic_inc(&root->fs_info->throttle_gen);
4591 wake_up(&root->fs_info->transaction_throttle);
4592 cond_resched();
4593 }
4594
4595 /*
4596 * run through the loop again to free the refs on the leaves.
4597 * This is faster than doing it in the loop above because
4598 * the leaves are likely to be clustered together. We end up
4599 * working in nice chunks on the extent allocation tree.
4600 */
4601 for (i = 0; i < refi; i++) {
4602 bytenr = sorted[i].bytenr;
4603 ret = btrfs_free_extent(trans, root, bytenr,
4604 blocksize, eb->start,
4605 root_owner, root_gen, 0, 1);
4606 BUG_ON(ret); 5024 BUG_ON(ret);
5025 BUG_ON(refs == 0);
4607 5026
4608 atomic_inc(&root->fs_info->throttle_gen); 5027 if (wc->stage == DROP_REFERENCE) {
4609 wake_up(&root->fs_info->transaction_throttle); 5028 if (refs == 1)
4610 cond_resched(); 5029 goto reada;
4611 }
4612out:
4613 kfree(sorted);
4614
4615 /*
4616 * update the path to show we've processed the entire level 1
4617 * node. This will get saved into the root's drop_snapshot_progress
4618 * field so these drops are not repeated again if this transaction
4619 * commits.
4620 */
4621 path->slots[1] = nritems;
4622 return 0;
4623}
4624
4625/*
4626 * helper function for drop_snapshot, this walks down the tree dropping ref
4627 * counts as it goes.
4628 */
4629static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4630 struct btrfs_root *root,
4631 struct btrfs_path *path, int *level)
4632{
4633 u64 root_owner;
4634 u64 root_gen;
4635 u64 bytenr;
4636 u64 ptr_gen;
4637 struct extent_buffer *next;
4638 struct extent_buffer *cur;
4639 struct extent_buffer *parent;
4640 u32 blocksize;
4641 int ret;
4642 u32 refs;
4643
4644 WARN_ON(*level < 0);
4645 WARN_ON(*level >= BTRFS_MAX_LEVEL);
4646 ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
4647 path->nodes[*level]->len, &refs);
4648 BUG_ON(ret);
4649 if (refs > 1)
4650 goto out;
4651
4652 /*
4653 * walk down to the last node level and free all the leaves
4654 */
4655 while (*level >= 0) {
4656 WARN_ON(*level < 0);
4657 WARN_ON(*level >= BTRFS_MAX_LEVEL);
4658 cur = path->nodes[*level];
4659
4660 if (btrfs_header_level(cur) != *level)
4661 WARN_ON(1);
4662
4663 if (path->slots[*level] >=
4664 btrfs_header_nritems(cur))
4665 break;
4666 5030
4667 /* the new code goes down to level 1 and does all the 5031 if (wc->level == 1 &&
4668 * leaves pointed to that node in bulk. So, this check 5032 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4669 * for level 0 will always be false. 5033 continue;
4670 * 5034 if (!wc->update_ref ||
4671 * But, the disk format allows the drop_snapshot_progress 5035 generation <= root->root_key.offset)
4672 * field in the root to leave things in a state where 5036 continue;
4673 * a leaf will need cleaning up here. If someone crashes 5037 btrfs_node_key_to_cpu(eb, &key, slot);
4674 * with the old code and then boots with the new code, 5038 ret = btrfs_comp_cpu_keys(&key,
4675 * we might find a leaf here. 5039 &wc->update_progress);
4676 */ 5040 if (ret < 0)
4677 if (*level == 0) { 5041 continue;
4678 ret = btrfs_drop_leaf_ref(trans, root, cur); 5042 } else {
4679 BUG_ON(ret); 5043 if (wc->level == 1 &&
4680 break; 5044 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5045 continue;
4681 } 5046 }
4682 5047reada:
4683 /* 5048 ret = readahead_tree_block(root, bytenr, blocksize,
4684 * once we get to level one, process the whole node 5049 generation);
4685 * at once, including everything below it. 5050 if (ret)
4686 */
4687 if (*level == 1) {
4688 ret = drop_level_one_refs(trans, root, path);
4689 BUG_ON(ret);
4690 break; 5051 break;
4691 } 5052 last = bytenr + blocksize;
4692 5053 nread++;
4693 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
4694 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4695 blocksize = btrfs_level_size(root, *level - 1);
4696
4697 ret = drop_snap_lookup_refcount(trans, root, bytenr,
4698 blocksize, &refs);
4699 BUG_ON(ret);
4700
4701 /*
4702 * if there is more than one reference, we don't need
4703 * to read that node to drop any references it has. We
4704 * just drop the ref we hold on that node and move on to the
4705 * next slot in this level.
4706 */
4707 if (refs != 1) {
4708 parent = path->nodes[*level];
4709 root_owner = btrfs_header_owner(parent);
4710 root_gen = btrfs_header_generation(parent);
4711 path->slots[*level]++;
4712
4713 ret = btrfs_free_extent(trans, root, bytenr,
4714 blocksize, parent->start,
4715 root_owner, root_gen,
4716 *level - 1, 1);
4717 BUG_ON(ret);
4718
4719 atomic_inc(&root->fs_info->throttle_gen);
4720 wake_up(&root->fs_info->transaction_throttle);
4721 cond_resched();
4722
4723 continue;
4724 }
4725
4726 /*
4727 * we need to keep freeing things in the next level down.
4728 * read the block and loop around to process it
4729 */
4730 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
4731 WARN_ON(*level <= 0);
4732 if (path->nodes[*level-1])
4733 free_extent_buffer(path->nodes[*level-1]);
4734 path->nodes[*level-1] = next;
4735 *level = btrfs_header_level(next);
4736 path->slots[*level] = 0;
4737 cond_resched();
4738 } 5054 }
4739out: 5055 wc->reada_slot = slot;
4740 WARN_ON(*level < 0);
4741 WARN_ON(*level >= BTRFS_MAX_LEVEL);
4742
4743 if (path->nodes[*level] == root->node) {
4744 parent = path->nodes[*level];
4745 bytenr = path->nodes[*level]->start;
4746 } else {
4747 parent = path->nodes[*level + 1];
4748 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
4749 }
4750
4751 blocksize = btrfs_level_size(root, *level);
4752 root_owner = btrfs_header_owner(parent);
4753 root_gen = btrfs_header_generation(parent);
4754
4755 /*
4756 * cleanup and free the reference on the last node
4757 * we processed
4758 */
4759 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
4760 parent->start, root_owner, root_gen,
4761 *level, 1);
4762 free_extent_buffer(path->nodes[*level]);
4763 path->nodes[*level] = NULL;
4764
4765 *level += 1;
4766 BUG_ON(ret);
4767
4768 cond_resched();
4769 return 0;
4770} 5056}
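
reada_walk_down() sizes its readahead batch adaptively: if the walker is still behind the last readahead target the batch shrinks, otherwise it grows up to a full node's worth of pointers. A minimal sketch of just that adjustment, with NODEPTRS_PER_BLOCK standing in for BTRFS_NODEPTRS_PER_BLOCK(root):

#include <stdio.h>

#define NODEPTRS_PER_BLOCK 493   /* placeholder for BTRFS_NODEPTRS_PER_BLOCK(root) */

/*
 * If the walker is still below the slot we last read ahead to, the previous
 * batch was too big, so shrink it (floor of 2); otherwise it kept up, so
 * grow it (capped at a full node's worth of pointers).
 */
static int adjust_reada_count(int reada_count, int cur_slot, int reada_slot)
{
	if (cur_slot < reada_slot) {
		reada_count = reada_count * 2 / 3;
		if (reada_count < 2)
			reada_count = 2;
	} else {
		reada_count = reada_count * 3 / 2;
		if (reada_count > NODEPTRS_PER_BLOCK)
			reada_count = NODEPTRS_PER_BLOCK;
	}
	return reada_count;
}

int main(void)
{
	int count = NODEPTRS_PER_BLOCK;

	count = adjust_reada_count(count, 10, 200);  /* fell behind: shrink */
	printf("after falling behind: %d\n", count);
	count = adjust_reada_count(count, 300, 200); /* caught up: grow     */
	printf("after catching up:   %d\n", count);
	return 0;
}
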
4771#endif
4772
4773struct walk_control {
4774 u64 refs[BTRFS_MAX_LEVEL];
4775 u64 flags[BTRFS_MAX_LEVEL];
4776 struct btrfs_key update_progress;
4777 int stage;
4778 int level;
4779 int shared_level;
4780 int update_ref;
4781 int keep_locks;
4782};
4783
4784#define DROP_REFERENCE 1
4785#define UPDATE_BACKREF 2
4786 5057
4787/* 5058/*
4788 * helper to process tree block while walking down the tree. 5059 * helper to process tree block while walking down the tree.
4789 * 5060 *
4790 * when wc->stage == DROP_REFERENCE, this function checks
4791 * reference count of the block. if the block is shared and
4792 * we need update back refs for the subtree rooted at the
4793 * block, this function changes wc->stage to UPDATE_BACKREF
4794 *
4795 * when wc->stage == UPDATE_BACKREF, this function updates 5061 * when wc->stage == UPDATE_BACKREF, this function updates
4796 * back refs for pointers in the block. 5062 * back refs for pointers in the block.
4797 * 5063 *
@@ -4800,11 +5066,10 @@ struct walk_control {
4800static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 5066static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4801 struct btrfs_root *root, 5067 struct btrfs_root *root,
4802 struct btrfs_path *path, 5068 struct btrfs_path *path,
4803 struct walk_control *wc) 5069 struct walk_control *wc, int lookup_info)
4804{ 5070{
4805 int level = wc->level; 5071 int level = wc->level;
4806 struct extent_buffer *eb = path->nodes[level]; 5072 struct extent_buffer *eb = path->nodes[level];
4807 struct btrfs_key key;
4808 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 5073 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
4809 int ret; 5074 int ret;
4810 5075
@@ -4816,8 +5081,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4816 * when reference count of tree block is 1, it won't increase 5081 * when reference count of tree block is 1, it won't increase
4817 * again. once full backref flag is set, we never clear it. 5082 * again. once full backref flag is set, we never clear it.
4818 */ 5083 */
4819 if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 5084 if (lookup_info &&
4820 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { 5085 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5086 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
4821 BUG_ON(!path->locks[level]); 5087 BUG_ON(!path->locks[level]);
4822 ret = btrfs_lookup_extent_info(trans, root, 5088 ret = btrfs_lookup_extent_info(trans, root,
4823 eb->start, eb->len, 5089 eb->start, eb->len,
@@ -4827,21 +5093,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4827 BUG_ON(wc->refs[level] == 0); 5093 BUG_ON(wc->refs[level] == 0);
4828 } 5094 }
4829 5095
4830 if (wc->stage == DROP_REFERENCE &&
4831 wc->update_ref && wc->refs[level] > 1) {
4832 BUG_ON(eb == root->node);
4833 BUG_ON(path->slots[level] > 0);
4834 if (level == 0)
4835 btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
4836 else
4837 btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
4838 if (btrfs_header_owner(eb) == root->root_key.objectid &&
4839 btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
4840 wc->stage = UPDATE_BACKREF;
4841 wc->shared_level = level;
4842 }
4843 }
4844
4845 if (wc->stage == DROP_REFERENCE) { 5096 if (wc->stage == DROP_REFERENCE) {
4846 if (wc->refs[level] > 1) 5097 if (wc->refs[level] > 1)
4847 return 1; 5098 return 1;
@@ -4878,6 +5129,136 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4878} 5129}
4879 5130
4880/* 5131/*
5132 * helper to process tree block pointer.
5133 *
5134 * when wc->stage == DROP_REFERENCE, this function checks
5135 * reference count of the block pointed to. if the block
5136 * is shared and we need update back refs for the subtree
5137 * rooted at the block, this function changes wc->stage to
5138 * UPDATE_BACKREF. if the block is shared and there is no
5139 * need to update back, this function drops the reference
5140 * to the block.
5141 *
5142 * NOTE: return value 1 means we should stop walking down.
5143 */
5144static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5145 struct btrfs_root *root,
5146 struct btrfs_path *path,
5147 struct walk_control *wc, int *lookup_info)
5148{
5149 u64 bytenr;
5150 u64 generation;
5151 u64 parent;
5152 u32 blocksize;
5153 struct btrfs_key key;
5154 struct extent_buffer *next;
5155 int level = wc->level;
5156 int reada = 0;
5157 int ret = 0;
5158
5159 generation = btrfs_node_ptr_generation(path->nodes[level],
5160 path->slots[level]);
5161 /*
5162 * if the lower level block was created before the snapshot
5163 * was created, we know there is no need to update back refs
5164 * for the subtree
5165 */
5166 if (wc->stage == UPDATE_BACKREF &&
5167 generation <= root->root_key.offset) {
5168 *lookup_info = 1;
5169 return 1;
5170 }
5171
5172 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5173 blocksize = btrfs_level_size(root, level - 1);
5174
5175 next = btrfs_find_tree_block(root, bytenr, blocksize);
5176 if (!next) {
5177 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
5178 reada = 1;
5179 }
5180 btrfs_tree_lock(next);
5181 btrfs_set_lock_blocking(next);
5182
5183 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5184 &wc->refs[level - 1],
5185 &wc->flags[level - 1]);
5186 BUG_ON(ret);
5187 BUG_ON(wc->refs[level - 1] == 0);
5188 *lookup_info = 0;
5189
5190 if (wc->stage == DROP_REFERENCE) {
5191 if (wc->refs[level - 1] > 1) {
5192 if (level == 1 &&
5193 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5194 goto skip;
5195
5196 if (!wc->update_ref ||
5197 generation <= root->root_key.offset)
5198 goto skip;
5199
5200 btrfs_node_key_to_cpu(path->nodes[level], &key,
5201 path->slots[level]);
5202 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5203 if (ret < 0)
5204 goto skip;
5205
5206 wc->stage = UPDATE_BACKREF;
5207 wc->shared_level = level - 1;
5208 }
5209 } else {
5210 if (level == 1 &&
5211 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5212 goto skip;
5213 }
5214
5215 if (!btrfs_buffer_uptodate(next, generation)) {
5216 btrfs_tree_unlock(next);
5217 free_extent_buffer(next);
5218 next = NULL;
5219 *lookup_info = 1;
5220 }
5221
5222 if (!next) {
5223 if (reada && level == 1)
5224 reada_walk_down(trans, root, wc, path);
5225 next = read_tree_block(root, bytenr, blocksize, generation);
5226 btrfs_tree_lock(next);
5227 btrfs_set_lock_blocking(next);
5228 }
5229
5230 level--;
5231 BUG_ON(level != btrfs_header_level(next));
5232 path->nodes[level] = next;
5233 path->slots[level] = 0;
5234 path->locks[level] = 1;
5235 wc->level = level;
5236 if (wc->level == 1)
5237 wc->reada_slot = 0;
5238 return 0;
5239skip:
5240 wc->refs[level - 1] = 0;
5241 wc->flags[level - 1] = 0;
5242 if (wc->stage == DROP_REFERENCE) {
5243 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5244 parent = path->nodes[level]->start;
5245 } else {
5246 BUG_ON(root->root_key.objectid !=
5247 btrfs_header_owner(path->nodes[level]));
5248 parent = 0;
5249 }
5250
5251 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
5252 root->root_key.objectid, level - 1, 0);
5253 BUG_ON(ret);
5254 }
5255 btrfs_tree_unlock(next);
5256 free_extent_buffer(next);
5257 *lookup_info = 1;
5258 return 1;
5259}
5260
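
Stripped of locking and I/O, do_walk_down() makes a three-way choice for each child block: descend into it, skip it (dropping the walker's reference when in the drop stage), or flip the walk into UPDATE_BACKREF for a shared subtree. The sketch below models only that decision; the inputs are flattened into plain values and the function name is invented:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum child_action {
	CHILD_DESCEND,           /* read the child and keep walking down */
	CHILD_SKIP,              /* don't descend (under DROP_REFERENCE the real
				  * code also drops the child's reference) */
	CHILD_SWITCH_TO_BACKREF  /* shared subtree: switch the walk to
				  * UPDATE_BACKREF and descend */
};

enum walk_stage { DROP_REFERENCE = 1, UPDATE_BACKREF = 2 };

static enum child_action classify_child(enum walk_stage stage, int level,
					 uint64_t child_refs,
					 bool parent_full_backref,
					 bool update_ref,
					 bool child_newer_than_snapshot,
					 bool key_at_or_past_progress)
{
	if (stage == UPDATE_BACKREF)
		return (level == 1 && parent_full_backref) ?
			CHILD_SKIP : CHILD_DESCEND;

	/* DROP_REFERENCE */
	if (child_refs > 1) {
		if (level == 1 && parent_full_backref)
			return CHILD_SKIP;
		if (!update_ref || !child_newer_than_snapshot)
			return CHILD_SKIP;
		if (!key_at_or_past_progress)
			return CHILD_SKIP;
		return CHILD_SWITCH_TO_BACKREF;
	}
	return CHILD_DESCEND;
}

int main(void)
{
	/* shared child, snapshot-aware ref update requested */
	printf("%d\n", classify_child(DROP_REFERENCE, 2, 3, false,
				      true, true, true));
	return 0;
}
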
5261/*
4881 * helper to process tree block while walking up the tree. 5262 * helper to process tree block while walking up the tree.
4882 * 5263 *
4883 * when wc->stage == DROP_REFERENCE, this function drops 5264 * when wc->stage == DROP_REFERENCE, this function drops
@@ -4904,7 +5285,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
4904 if (level < wc->shared_level) 5285 if (level < wc->shared_level)
4905 goto out; 5286 goto out;
4906 5287
4907 BUG_ON(wc->refs[level] <= 1);
4908 ret = find_next_key(path, level + 1, &wc->update_progress); 5288 ret = find_next_key(path, level + 1, &wc->update_progress);
4909 if (ret > 0) 5289 if (ret > 0)
4910 wc->update_ref = 0; 5290 wc->update_ref = 0;
@@ -4935,8 +5315,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
4935 path->locks[level] = 0; 5315 path->locks[level] = 0;
4936 return 1; 5316 return 1;
4937 } 5317 }
4938 } else {
4939 BUG_ON(level != 0);
4940 } 5318 }
4941 } 5319 }
4942 5320
@@ -4989,39 +5367,28 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4989 struct btrfs_path *path, 5367 struct btrfs_path *path,
4990 struct walk_control *wc) 5368 struct walk_control *wc)
4991{ 5369{
4992 struct extent_buffer *next;
4993 struct extent_buffer *cur;
4994 u64 bytenr;
4995 u64 ptr_gen;
4996 u32 blocksize;
4997 int level = wc->level; 5370 int level = wc->level;
5371 int lookup_info = 1;
4998 int ret; 5372 int ret;
4999 5373
5000 while (level >= 0) { 5374 while (level >= 0) {
5001 cur = path->nodes[level]; 5375 if (path->slots[level] >=
5002 BUG_ON(path->slots[level] >= btrfs_header_nritems(cur)); 5376 btrfs_header_nritems(path->nodes[level]))
5377 break;
5003 5378
5004 ret = walk_down_proc(trans, root, path, wc); 5379 ret = walk_down_proc(trans, root, path, wc, lookup_info);
5005 if (ret > 0) 5380 if (ret > 0)
5006 break; 5381 break;
5007 5382
5008 if (level == 0) 5383 if (level == 0)
5009 break; 5384 break;
5010 5385
5011 bytenr = btrfs_node_blockptr(cur, path->slots[level]); 5386 ret = do_walk_down(trans, root, path, wc, &lookup_info);
5012 blocksize = btrfs_level_size(root, level - 1); 5387 if (ret > 0) {
5013 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]); 5388 path->slots[level]++;
5014 5389 continue;
5015 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 5390 }
5016 btrfs_tree_lock(next); 5391 level = wc->level;
5017 btrfs_set_lock_blocking(next);
5018
5019 level--;
5020 BUG_ON(level != btrfs_header_level(next));
5021 path->nodes[level] = next;
5022 path->slots[level] = 0;
5023 path->locks[level] = 1;
5024 wc->level = level;
5025 } 5392 }
5026 return 0; 5393 return 0;
5027} 5394}
@@ -5111,9 +5478,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5111 err = ret; 5478 err = ret;
5112 goto out; 5479 goto out;
5113 } 5480 }
5114 btrfs_node_key_to_cpu(path->nodes[level], &key, 5481 WARN_ON(ret > 0);
5115 path->slots[level]);
5116 WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
5117 5482
5118 /* 5483 /*
5119 * unlock our path, this is safe because only this 5484 * unlock our path, this is safe because only this
@@ -5148,6 +5513,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5148 wc->stage = DROP_REFERENCE; 5513 wc->stage = DROP_REFERENCE;
5149 wc->update_ref = update_ref; 5514 wc->update_ref = update_ref;
5150 wc->keep_locks = 0; 5515 wc->keep_locks = 0;
5516 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
5151 5517
5152 while (1) { 5518 while (1) {
5153 ret = walk_down_tree(trans, root, path, wc); 5519 ret = walk_down_tree(trans, root, path, wc);
@@ -5200,9 +5566,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5200 ret = btrfs_del_root(trans, tree_root, &root->root_key); 5566 ret = btrfs_del_root(trans, tree_root, &root->root_key);
5201 BUG_ON(ret); 5567 BUG_ON(ret);
5202 5568
5203 free_extent_buffer(root->node); 5569 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
5204 free_extent_buffer(root->commit_root); 5570 ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
5205 kfree(root); 5571 NULL, NULL);
5572 BUG_ON(ret < 0);
5573 if (ret > 0) {
5574 ret = btrfs_del_orphan_item(trans, tree_root,
5575 root->root_key.objectid);
5576 BUG_ON(ret);
5577 }
5578 }
5579
5580 if (root->in_radix) {
5581 btrfs_free_fs_root(tree_root->fs_info, root);
5582 } else {
5583 free_extent_buffer(root->node);
5584 free_extent_buffer(root->commit_root);
5585 kfree(root);
5586 }
5206out: 5587out:
5207 btrfs_end_transaction(trans, tree_root); 5588 btrfs_end_transaction(trans, tree_root);
5208 kfree(wc); 5589 kfree(wc);
@@ -5254,6 +5635,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5254 wc->stage = DROP_REFERENCE; 5635 wc->stage = DROP_REFERENCE;
5255 wc->update_ref = 0; 5636 wc->update_ref = 0;
5256 wc->keep_locks = 1; 5637 wc->keep_locks = 1;
5638 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
5257 5639
5258 while (1) { 5640 while (1) {
5259 wret = walk_down_tree(trans, root, path, wc); 5641 wret = walk_down_tree(trans, root, path, wc);
@@ -5396,9 +5778,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
5396 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); 5778 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
5397 while (1) { 5779 while (1) {
5398 int ret; 5780 int ret;
5399 spin_lock(&em_tree->lock); 5781 write_lock(&em_tree->lock);
5400 ret = add_extent_mapping(em_tree, em); 5782 ret = add_extent_mapping(em_tree, em);
5401 spin_unlock(&em_tree->lock); 5783 write_unlock(&em_tree->lock);
5402 if (ret != -EEXIST) { 5784 if (ret != -EEXIST) {
5403 free_extent_map(em); 5785 free_extent_map(em);
5404 break; 5786 break;
@@ -6841,287 +7223,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
6841 return 0; 7223 return 0;
6842} 7224}
6843 7225
6844#if 0 7226/*
6845static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 7227 * checks to see if it's even possible to relocate this block group.
6846 struct btrfs_root *root, 7228 *
6847 u64 objectid, u64 size) 7229 * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
6848{ 7230 * ok to go ahead and try.
6849 struct btrfs_path *path; 7231 */
6850 struct btrfs_inode_item *item; 7232int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6851 struct extent_buffer *leaf;
6852 int ret;
6853
6854 path = btrfs_alloc_path();
6855 if (!path)
6856 return -ENOMEM;
6857
6858 path->leave_spinning = 1;
6859 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
6860 if (ret)
6861 goto out;
6862
6863 leaf = path->nodes[0];
6864 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
6865 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
6866 btrfs_set_inode_generation(leaf, item, 1);
6867 btrfs_set_inode_size(leaf, item, size);
6868 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
6869 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
6870 btrfs_mark_buffer_dirty(leaf);
6871 btrfs_release_path(root, path);
6872out:
6873 btrfs_free_path(path);
6874 return ret;
6875}
6876
6877static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
6878 struct btrfs_block_group_cache *group)
6879{ 7233{
6880 struct inode *inode = NULL; 7234 struct btrfs_block_group_cache *block_group;
6881 struct btrfs_trans_handle *trans; 7235 struct btrfs_space_info *space_info;
6882 struct btrfs_root *root; 7236 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6883 struct btrfs_key root_key; 7237 struct btrfs_device *device;
6884 u64 objectid = BTRFS_FIRST_FREE_OBJECTID; 7238 int full = 0;
6885 int err = 0; 7239 int ret = 0;
6886 7240
6887 root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; 7241 block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
6888 root_key.type = BTRFS_ROOT_ITEM_KEY;
6889 root_key.offset = (u64)-1;
6890 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
6891 if (IS_ERR(root))
6892 return ERR_CAST(root);
6893 7242
6894 trans = btrfs_start_transaction(root, 1); 7243 /* odd, couldn't find the block group, leave it alone */
6895 BUG_ON(!trans); 7244 if (!block_group)
7245 return -1;
6896 7246
6897 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 7247 /* no bytes used, we're good */
6898 if (err) 7248 if (!btrfs_block_group_used(&block_group->item))
6899 goto out; 7249 goto out;
6900 7250
6901 err = __insert_orphan_inode(trans, root, objectid, group->key.offset); 7251 space_info = block_group->space_info;
6902 BUG_ON(err); 7252 spin_lock(&space_info->lock);
6903
6904 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
6905 group->key.offset, 0, group->key.offset,
6906 0, 0, 0);
6907 BUG_ON(err);
6908
6909 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
6910 if (inode->i_state & I_NEW) {
6911 BTRFS_I(inode)->root = root;
6912 BTRFS_I(inode)->location.objectid = objectid;
6913 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
6914 BTRFS_I(inode)->location.offset = 0;
6915 btrfs_read_locked_inode(inode);
6916 unlock_new_inode(inode);
6917 BUG_ON(is_bad_inode(inode));
6918 } else {
6919 BUG_ON(1);
6920 }
6921 BTRFS_I(inode)->index_cnt = group->key.objectid;
6922
6923 err = btrfs_orphan_add(trans, inode);
6924out:
6925 btrfs_end_transaction(trans, root);
6926 if (err) {
6927 if (inode)
6928 iput(inode);
6929 inode = ERR_PTR(err);
6930 }
6931 return inode;
6932}
6933
6934int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
6935{
6936
6937 struct btrfs_ordered_sum *sums;
6938 struct btrfs_sector_sum *sector_sum;
6939 struct btrfs_ordered_extent *ordered;
6940 struct btrfs_root *root = BTRFS_I(inode)->root;
6941 struct list_head list;
6942 size_t offset;
6943 int ret;
6944 u64 disk_bytenr;
6945
6946 INIT_LIST_HEAD(&list);
6947
6948 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
6949 BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
6950
6951 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
6952 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
6953 disk_bytenr + len - 1, &list);
6954
6955 while (!list_empty(&list)) {
6956 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
6957 list_del_init(&sums->list);
6958
6959 sector_sum = sums->sums;
6960 sums->bytenr = ordered->start;
6961 7253
6962 offset = 0; 7254 full = space_info->full;
6963 while (offset < sums->len) {
6964 sector_sum->bytenr += ordered->start - disk_bytenr;
6965 sector_sum++;
6966 offset += root->sectorsize;
6967 }
6968 7255
6969 btrfs_add_ordered_sum(inode, ordered, sums); 7256 /*
7257 * if this is the last block group we have in this space, we can't
7258 * relocate it unless we're able to allocate a new chunk below.
7259 *
7260 * Otherwise, we need to make sure we have room in the space to handle
7261 * all of the extents from this block group. If we can, we're good
7262 */
7263 if ((space_info->total_bytes != block_group->key.offset) &&
7264 (space_info->bytes_used + space_info->bytes_reserved +
7265 space_info->bytes_pinned + space_info->bytes_readonly +
7266 btrfs_block_group_used(&block_group->item) <
7267 space_info->total_bytes)) {
7268 spin_unlock(&space_info->lock);
7269 goto out;
6970 } 7270 }
6971 btrfs_put_ordered_extent(ordered); 7271 spin_unlock(&space_info->lock);
6972 return 0;
6973}
6974
6975int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
6976{
6977 struct btrfs_trans_handle *trans;
6978 struct btrfs_path *path;
6979 struct btrfs_fs_info *info = root->fs_info;
6980 struct extent_buffer *leaf;
6981 struct inode *reloc_inode;
6982 struct btrfs_block_group_cache *block_group;
6983 struct btrfs_key key;
6984 u64 skipped;
6985 u64 cur_byte;
6986 u64 total_found;
6987 u32 nritems;
6988 int ret;
6989 int progress;
6990 int pass = 0;
6991
6992 root = root->fs_info->extent_root;
6993
6994 block_group = btrfs_lookup_block_group(info, group_start);
6995 BUG_ON(!block_group);
6996
6997 printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
6998 (unsigned long long)block_group->key.objectid,
6999 (unsigned long long)block_group->flags);
7000
7001 path = btrfs_alloc_path();
7002 BUG_ON(!path);
7003
7004 reloc_inode = create_reloc_inode(info, block_group);
7005 BUG_ON(IS_ERR(reloc_inode));
7006
7007 __alloc_chunk_for_shrink(root, block_group, 1);
7008 set_block_group_readonly(block_group);
7009
7010 btrfs_start_delalloc_inodes(info->tree_root);
7011 btrfs_wait_ordered_extents(info->tree_root, 0);
7012again:
7013 skipped = 0;
7014 total_found = 0;
7015 progress = 0;
7016 key.objectid = block_group->key.objectid;
7017 key.offset = 0;
7018 key.type = 0;
7019 cur_byte = key.objectid;
7020
7021 trans = btrfs_start_transaction(info->tree_root, 1);
7022 btrfs_commit_transaction(trans, info->tree_root);
7023 7272
7024 mutex_lock(&root->fs_info->cleaner_mutex); 7273 /*
7025 btrfs_clean_old_snapshots(info->tree_root); 7274 * ok we don't have enough space, but maybe we have free space on our
7026 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); 7275 * devices to allocate new chunks for relocation, so loop through our
7027 mutex_unlock(&root->fs_info->cleaner_mutex); 7276 * alloc devices and guess if we have enough space. However, if we
7277 * were marked as full, then we know there aren't enough chunks, and we
7278 * can just return.
7279 */
7280 ret = -1;
7281 if (full)
7282 goto out;
7028 7283
7029 trans = btrfs_start_transaction(info->tree_root, 1); 7284 mutex_lock(&root->fs_info->chunk_mutex);
7030 btrfs_commit_transaction(trans, info->tree_root); 7285 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7286 u64 min_free = btrfs_block_group_used(&block_group->item);
7287 u64 dev_offset, max_avail;
7031 7288
7032 while (1) { 7289 /*
7033 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7290 * check to make sure we can actually find a chunk with enough
7034 if (ret < 0) 7291 * space to fit our block group in.
7035 goto out; 7292 */
7036next: 7293 if (device->total_bytes > device->bytes_used + min_free) {
7037 leaf = path->nodes[0]; 7294 ret = find_free_dev_extent(NULL, device, min_free,
7038 nritems = btrfs_header_nritems(leaf); 7295 &dev_offset, &max_avail);
7039 if (path->slots[0] >= nritems) { 7296 if (!ret)
7040 ret = btrfs_next_leaf(root, path);
7041 if (ret < 0)
7042 goto out;
7043 if (ret == 1) {
7044 ret = 0;
7045 break; 7297 break;
7046 } 7298 ret = -1;
7047 leaf = path->nodes[0];
7048 nritems = btrfs_header_nritems(leaf);
7049 } 7299 }
7050
7051 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7052
7053 if (key.objectid >= block_group->key.objectid +
7054 block_group->key.offset)
7055 break;
7056
7057 if (progress && need_resched()) {
7058 btrfs_release_path(root, path);
7059 cond_resched();
7060 progress = 0;
7061 continue;
7062 }
7063 progress = 1;
7064
7065 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
7066 key.objectid + key.offset <= cur_byte) {
7067 path->slots[0]++;
7068 goto next;
7069 }
7070
7071 total_found++;
7072 cur_byte = key.objectid + key.offset;
7073 btrfs_release_path(root, path);
7074
7075 __alloc_chunk_for_shrink(root, block_group, 0);
7076 ret = relocate_one_extent(root, path, &key, block_group,
7077 reloc_inode, pass);
7078 BUG_ON(ret < 0);
7079 if (ret > 0)
7080 skipped++;
7081
7082 key.objectid = cur_byte;
7083 key.type = 0;
7084 key.offset = 0;
7085 }
7086
7087 btrfs_release_path(root, path);
7088
7089 if (pass == 0) {
7090 btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
7091 invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
7092 } 7300 }
7093 7301 mutex_unlock(&root->fs_info->chunk_mutex);
7094 if (total_found > 0) {
7095 printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
7096 (unsigned long long)total_found, pass);
7097 pass++;
7098 if (total_found == skipped && pass > 2) {
7099 iput(reloc_inode);
7100 reloc_inode = create_reloc_inode(info, block_group);
7101 pass = 0;
7102 }
7103 goto again;
7104 }
7105
7106 /* delete reloc_inode */
7107 iput(reloc_inode);
7108
7109 /* unpin extents in this range */
7110 trans = btrfs_start_transaction(info->tree_root, 1);
7111 btrfs_commit_transaction(trans, info->tree_root);
7112
7113 spin_lock(&block_group->lock);
7114 WARN_ON(block_group->pinned > 0);
7115 WARN_ON(block_group->reserved > 0);
7116 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
7117 spin_unlock(&block_group->lock);
7118 btrfs_put_block_group(block_group);
7119 ret = 0;
7120out: 7302out:
7121 btrfs_free_path(path); 7303 btrfs_put_block_group(block_group);
7122 return ret; 7304 return ret;
7123} 7305}
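
The core of btrfs_can_relocate() is bookkeeping: either the rest of the space_info can absorb this block group's used bytes, or some device still has room for a replacement chunk. A standalone approximation follows; the structs are invented stand-ins, and the real code additionally searches for a contiguous free device extent rather than just comparing totals:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct space_totals {       /* mirrors the btrfs_space_info fields read above */
	uint64_t total_bytes;
	uint64_t bytes_used;
	uint64_t bytes_reserved;
	uint64_t bytes_pinned;
	uint64_t bytes_readonly;
	bool full;
};

struct dev_totals {
	uint64_t total_bytes;
	uint64_t bytes_used;
};

/* 0 means relocation looks feasible, -1 means don't bother (same convention
 * as the function above). */
static int can_relocate(const struct space_totals *s, uint64_t group_size,
			uint64_t group_used, const struct dev_totals *devs,
			int ndevs)
{
	if (group_used == 0)
		return 0;   /* nothing to move */

	/* not the last block group, and everything it holds fits elsewhere */
	if (s->total_bytes != group_size &&
	    s->bytes_used + s->bytes_reserved + s->bytes_pinned +
	    s->bytes_readonly + group_used < s->total_bytes)
		return 0;

	if (s->full)
		return -1;  /* no point guessing about new chunks */

	/* otherwise, is there a device with room for a chunk big enough?
	 * (the real code then calls find_free_dev_extent to confirm) */
	for (int i = 0; i < ndevs; i++) {
		if (devs[i].total_bytes > devs[i].bytes_used + group_used)
			return 0;
	}
	return -1;
}

int main(void)
{
	struct space_totals s = {
		.total_bytes = 10ULL << 30, .bytes_used = 9ULL << 30,
		.bytes_reserved = 0, .bytes_pinned = 0, .bytes_readonly = 0,
		.full = false,
	};
	struct dev_totals devs[] = { { .total_bytes = 20ULL << 30,
				       .bytes_used = 18ULL << 30 } };
	printf("can relocate: %d\n",
	       can_relocate(&s, 1ULL << 30, 800ULL << 20, devs, 1));
	return 0;
}
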
7124#endif
7125 7306
7126static int find_first_block_group(struct btrfs_root *root, 7307static int find_first_block_group(struct btrfs_root *root,
7127 struct btrfs_path *path, struct btrfs_key *key) 7308 struct btrfs_path *path, struct btrfs_key *key)
@@ -7164,8 +7345,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7164{ 7345{
7165 struct btrfs_block_group_cache *block_group; 7346 struct btrfs_block_group_cache *block_group;
7166 struct btrfs_space_info *space_info; 7347 struct btrfs_space_info *space_info;
7348 struct btrfs_caching_control *caching_ctl;
7167 struct rb_node *n; 7349 struct rb_node *n;
7168 7350
7351 down_write(&info->extent_commit_sem);
7352 while (!list_empty(&info->caching_block_groups)) {
7353 caching_ctl = list_entry(info->caching_block_groups.next,
7354 struct btrfs_caching_control, list);
7355 list_del(&caching_ctl->list);
7356 put_caching_control(caching_ctl);
7357 }
7358 up_write(&info->extent_commit_sem);
7359
7169 spin_lock(&info->block_group_cache_lock); 7360 spin_lock(&info->block_group_cache_lock);
7170 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 7361 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7171 block_group = rb_entry(n, struct btrfs_block_group_cache, 7362 block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -7179,8 +7370,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7179 up_write(&block_group->space_info->groups_sem); 7370 up_write(&block_group->space_info->groups_sem);
7180 7371
7181 if (block_group->cached == BTRFS_CACHE_STARTED) 7372 if (block_group->cached == BTRFS_CACHE_STARTED)
7182 wait_event(block_group->caching_q, 7373 wait_block_group_cache_done(block_group);
7183 block_group_cache_done(block_group));
7184 7374
7185 btrfs_remove_free_space_cache(block_group); 7375 btrfs_remove_free_space_cache(block_group);
7186 7376
@@ -7250,7 +7440,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7250 spin_lock_init(&cache->lock); 7440 spin_lock_init(&cache->lock);
7251 spin_lock_init(&cache->tree_lock); 7441 spin_lock_init(&cache->tree_lock);
7252 cache->fs_info = info; 7442 cache->fs_info = info;
7253 init_waitqueue_head(&cache->caching_q);
7254 INIT_LIST_HEAD(&cache->list); 7443 INIT_LIST_HEAD(&cache->list);
7255 INIT_LIST_HEAD(&cache->cluster_list); 7444 INIT_LIST_HEAD(&cache->cluster_list);
7256 7445
@@ -7272,8 +7461,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7272 cache->flags = btrfs_block_group_flags(&cache->item); 7461 cache->flags = btrfs_block_group_flags(&cache->item);
7273 cache->sectorsize = root->sectorsize; 7462 cache->sectorsize = root->sectorsize;
7274 7463
7275 remove_sb_from_cache(root, cache);
7276
7277 /* 7464 /*
7278 * check for two cases, either we are full, and therefore 7465 * check for two cases, either we are full, and therefore
7279 * don't need to bother with the caching work since we won't 7466 * don't need to bother with the caching work since we won't
@@ -7282,13 +7469,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7282 * time, particularly in the full case. 7469 * time, particularly in the full case.
7283 */ 7470 */
7284 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 7471 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7472 exclude_super_stripes(root, cache);
7473 cache->last_byte_to_unpin = (u64)-1;
7285 cache->cached = BTRFS_CACHE_FINISHED; 7474 cache->cached = BTRFS_CACHE_FINISHED;
7475 free_excluded_extents(root, cache);
7286 } else if (btrfs_block_group_used(&cache->item) == 0) { 7476 } else if (btrfs_block_group_used(&cache->item) == 0) {
7477 exclude_super_stripes(root, cache);
7478 cache->last_byte_to_unpin = (u64)-1;
7287 cache->cached = BTRFS_CACHE_FINISHED; 7479 cache->cached = BTRFS_CACHE_FINISHED;
7288 add_new_free_space(cache, root->fs_info, 7480 add_new_free_space(cache, root->fs_info,
7289 found_key.objectid, 7481 found_key.objectid,
7290 found_key.objectid + 7482 found_key.objectid +
7291 found_key.offset); 7483 found_key.offset);
7484 free_excluded_extents(root, cache);
7292 } 7485 }
7293 7486
7294 ret = update_space_info(info, cache->flags, found_key.offset, 7487 ret = update_space_info(info, cache->flags, found_key.offset,
@@ -7296,6 +7489,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7296 &space_info); 7489 &space_info);
7297 BUG_ON(ret); 7490 BUG_ON(ret);
7298 cache->space_info = space_info; 7491 cache->space_info = space_info;
7492 spin_lock(&cache->space_info->lock);
7493 cache->space_info->bytes_super += cache->bytes_super;
7494 spin_unlock(&cache->space_info->lock);
7495
7299 down_write(&space_info->groups_sem); 7496 down_write(&space_info->groups_sem);
7300 list_add_tail(&cache->list, &space_info->block_groups); 7497 list_add_tail(&cache->list, &space_info->block_groups);
7301 up_write(&space_info->groups_sem); 7498 up_write(&space_info->groups_sem);
@@ -7345,7 +7542,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7345 atomic_set(&cache->count, 1); 7542 atomic_set(&cache->count, 1);
7346 spin_lock_init(&cache->lock); 7543 spin_lock_init(&cache->lock);
7347 spin_lock_init(&cache->tree_lock); 7544 spin_lock_init(&cache->tree_lock);
7348 init_waitqueue_head(&cache->caching_q);
7349 INIT_LIST_HEAD(&cache->list); 7545 INIT_LIST_HEAD(&cache->list);
7350 INIT_LIST_HEAD(&cache->cluster_list); 7546 INIT_LIST_HEAD(&cache->cluster_list);
7351 7547
@@ -7354,15 +7550,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7354 cache->flags = type; 7550 cache->flags = type;
7355 btrfs_set_block_group_flags(&cache->item, type); 7551 btrfs_set_block_group_flags(&cache->item, type);
7356 7552
7553 cache->last_byte_to_unpin = (u64)-1;
7357 cache->cached = BTRFS_CACHE_FINISHED; 7554 cache->cached = BTRFS_CACHE_FINISHED;
7358 remove_sb_from_cache(root, cache);
7555 exclude_super_stripes(root, cache);
7359 7556
7360 add_new_free_space(cache, root->fs_info, chunk_offset, 7557 add_new_free_space(cache, root->fs_info, chunk_offset,
7361 chunk_offset + size); 7558 chunk_offset + size);
7362 7559
7560 free_excluded_extents(root, cache);
7561
7363 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7562 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7364 &cache->space_info); 7563 &cache->space_info);
7365 BUG_ON(ret); 7564 BUG_ON(ret);
7565
7566 spin_lock(&cache->space_info->lock);
7567 cache->space_info->bytes_super += cache->bytes_super;
7568 spin_unlock(&cache->space_info->lock);
7569
7366 down_write(&cache->space_info->groups_sem); 7570 down_write(&cache->space_info->groups_sem);
7367 list_add_tail(&cache->list, &cache->space_info->block_groups); 7571 list_add_tail(&cache->list, &cache->space_info->block_groups);
7368 up_write(&cache->space_info->groups_sem); 7572 up_write(&cache->space_info->groups_sem);
@@ -7428,8 +7632,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7428 up_write(&block_group->space_info->groups_sem); 7632 up_write(&block_group->space_info->groups_sem);
7429 7633
7430 if (block_group->cached == BTRFS_CACHE_STARTED) 7634 if (block_group->cached == BTRFS_CACHE_STARTED)
7431 wait_event(block_group->caching_q,
7432 block_group_cache_done(block_group));
7635 wait_block_group_cache_done(block_group);
7433 7636
7434 btrfs_remove_free_space_cache(block_group); 7637 btrfs_remove_free_space_cache(block_group);
7435 7638