about summary refs log tree commit diff stats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- fs/btrfs/async-thread.c 6
-rw-r--r-- fs/btrfs/compression.c 1
-rw-r--r-- fs/btrfs/ctree.c 121
-rw-r--r-- fs/btrfs/ctree.h 30
-rw-r--r-- fs/btrfs/disk-io.c 15
-rw-r--r-- fs/btrfs/extent-tree.c 1096
-rw-r--r-- fs/btrfs/file.c 6
-rw-r--r-- fs/btrfs/free-space-cache.c 1058
-rw-r--r-- fs/btrfs/free-space-cache.h 8
-rw-r--r-- fs/btrfs/inode.c 31
-rw-r--r-- fs/btrfs/ioctl.c 7
-rw-r--r-- fs/btrfs/print-tree.c 6
-rw-r--r-- fs/btrfs/relocation.c 17
-rw-r--r-- fs/btrfs/super.c 1
-rw-r--r-- fs/btrfs/transaction.c 60
-rw-r--r-- fs/btrfs/transaction.h 1
-rw-r--r-- fs/btrfs/tree-log.c 2
-rw-r--r-- fs/btrfs/volumes.c 46
-rw-r--r-- fs/btrfs/zlib.c 6
19 files changed, 1861 insertions, 657 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7f88628a1a72..019e8af449ab 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -299,8 +299,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
299 "btrfs-%s-%d", workers->name, 299 "btrfs-%s-%d", workers->name,
300 workers->num_workers + i); 300 workers->num_workers + i);
301 if (IS_ERR(worker->task)) { 301 if (IS_ERR(worker->task)) {
302 kfree(worker);
303 ret = PTR_ERR(worker->task); 302 ret = PTR_ERR(worker->task);
303 kfree(worker);
304 goto fail; 304 goto fail;
305 } 305 }
306 306
@@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work)
424 * list 424 * list
425 */ 425 */
426 if (worker->idle) { 426 if (worker->idle) {
427 spin_lock_irqsave(&worker->workers->lock, flags); 427 spin_lock(&worker->workers->lock);
428 worker->idle = 0; 428 worker->idle = 0;
429 list_move_tail(&worker->worker_list, 429 list_move_tail(&worker->worker_list,
430 &worker->workers->worker_list); 430 &worker->workers->worker_list);
431 spin_unlock_irqrestore(&worker->workers->lock, flags); 431 spin_unlock(&worker->workers->lock);
432 } 432 }
433 if (!worker->working) { 433 if (!worker->working) {
434 wake = 1; 434 wake = 1;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index de1e2fd32080..9d8ba4d54a37 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -26,7 +26,6 @@
26#include <linux/time.h> 26#include <linux/time.h>
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
31#include <linux/mpage.h> 30#include <linux/mpage.h>
32#include <linux/swap.h> 31#include <linux/swap.h>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 60a45f3a4e91..3fdcc0512d3a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -557,19 +557,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
557 557
558 btrfs_disk_key_to_cpu(&k1, disk); 558 btrfs_disk_key_to_cpu(&k1, disk);
559 559
560 if (k1.objectid > k2->objectid) 560 return btrfs_comp_cpu_keys(&k1, k2);
561 return 1;
562 if (k1.objectid < k2->objectid)
563 return -1;
564 if (k1.type > k2->type)
565 return 1;
566 if (k1.type < k2->type)
567 return -1;
568 if (k1.offset > k2->offset)
569 return 1;
570 if (k1.offset < k2->offset)
571 return -1;
572 return 0;
573} 561}
574 562
575/* 563/*
@@ -1052,9 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1052 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1040 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1053 return 0; 1041 return 0;
1054 1042
1055 if (btrfs_header_nritems(mid) > 2)
1056 return 0;
1057
1058 if (btrfs_header_nritems(mid) < 2) 1043 if (btrfs_header_nritems(mid) < 2)
1059 err_on_enospc = 1; 1044 err_on_enospc = 1;
1060 1045
@@ -1701,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1701 struct extent_buffer *b; 1686 struct extent_buffer *b;
1702 int slot; 1687 int slot;
1703 int ret; 1688 int ret;
1689 int err;
1704 int level; 1690 int level;
1705 int lowest_unlock = 1; 1691 int lowest_unlock = 1;
1706 u8 lowest_level = 0; 1692 u8 lowest_level = 0;
@@ -1737,8 +1723,6 @@ again:
1737 p->locks[level] = 1; 1723 p->locks[level] = 1;
1738 1724
1739 if (cow) { 1725 if (cow) {
1740 int wret;
1741
1742 /* 1726 /*
1743 * if we don't really need to cow this block 1727 * if we don't really need to cow this block
1744 * then we don't want to set the path blocking, 1728 * then we don't want to set the path blocking,
@@ -1749,12 +1733,12 @@ again:
1749 1733
1750 btrfs_set_path_blocking(p); 1734 btrfs_set_path_blocking(p);
1751 1735
1752 wret = btrfs_cow_block(trans, root, b, 1736 err = btrfs_cow_block(trans, root, b,
1753 p->nodes[level + 1], 1737 p->nodes[level + 1],
1754 p->slots[level + 1], &b); 1738 p->slots[level + 1], &b);
1755 if (wret) { 1739 if (err) {
1756 free_extent_buffer(b); 1740 free_extent_buffer(b);
1757 ret = wret; 1741 ret = err;
1758 goto done; 1742 goto done;
1759 } 1743 }
1760 } 1744 }
@@ -1793,41 +1777,45 @@ cow_done:
1793 ret = bin_search(b, key, level, &slot); 1777 ret = bin_search(b, key, level, &slot);
1794 1778
1795 if (level != 0) { 1779 if (level != 0) {
1796 if (ret && slot > 0) 1780 int dec = 0;
1781 if (ret && slot > 0) {
1782 dec = 1;
1797 slot -= 1; 1783 slot -= 1;
1784 }
1798 p->slots[level] = slot; 1785 p->slots[level] = slot;
1799 ret = setup_nodes_for_search(trans, root, p, b, level, 1786 err = setup_nodes_for_search(trans, root, p, b, level,
1800 ins_len); 1787 ins_len);
1801 if (ret == -EAGAIN) 1788 if (err == -EAGAIN)
1802 goto again; 1789 goto again;
1803 else if (ret) 1790 if (err) {
1791 ret = err;
1804 goto done; 1792 goto done;
1793 }
1805 b = p->nodes[level]; 1794 b = p->nodes[level];
1806 slot = p->slots[level]; 1795 slot = p->slots[level];
1807 1796
1808 unlock_up(p, level, lowest_unlock); 1797 unlock_up(p, level, lowest_unlock);
1809 1798
1810 /* this is only true while dropping a snapshot */
1811 if (level == lowest_level) { 1799 if (level == lowest_level) {
1812 ret = 0; 1800 if (dec)
1801 p->slots[level]++;
1813 goto done; 1802 goto done;
1814 } 1803 }
1815 1804
1816 ret = read_block_for_search(trans, root, p, 1805 err = read_block_for_search(trans, root, p,
1817 &b, level, slot, key); 1806 &b, level, slot, key);
1818 if (ret == -EAGAIN) 1807 if (err == -EAGAIN)
1819 goto again; 1808 goto again;
1820 1809 if (err) {
1821 if (ret == -EIO) 1810 ret = err;
1822 goto done; 1811 goto done;
1812 }
1823 1813
1824 if (!p->skip_locking) { 1814 if (!p->skip_locking) {
1825 int lret;
1826
1827 btrfs_clear_path_blocking(p, NULL); 1815 btrfs_clear_path_blocking(p, NULL);
1828 lret = btrfs_try_spin_lock(b); 1816 err = btrfs_try_spin_lock(b);
1829 1817
1830 if (!lret) { 1818 if (!err) {
1831 btrfs_set_path_blocking(p); 1819 btrfs_set_path_blocking(p);
1832 btrfs_tree_lock(b); 1820 btrfs_tree_lock(b);
1833 btrfs_clear_path_blocking(p, b); 1821 btrfs_clear_path_blocking(p, b);
@@ -1837,16 +1825,14 @@ cow_done:
1837 p->slots[level] = slot; 1825 p->slots[level] = slot;
1838 if (ins_len > 0 && 1826 if (ins_len > 0 &&
1839 btrfs_leaf_free_space(root, b) < ins_len) { 1827 btrfs_leaf_free_space(root, b) < ins_len) {
1840 int sret;
1841
1842 btrfs_set_path_blocking(p); 1828 btrfs_set_path_blocking(p);
1843 sret = split_leaf(trans, root, key, 1829 err = split_leaf(trans, root, key,
1844 p, ins_len, ret == 0); 1830 p, ins_len, ret == 0);
1845 btrfs_clear_path_blocking(p, NULL); 1831 btrfs_clear_path_blocking(p, NULL);
1846 1832
1847 BUG_ON(sret > 0); 1833 BUG_ON(err > 0);
1848 if (sret) { 1834 if (err) {
1849 ret = sret; 1835 ret = err;
1850 goto done; 1836 goto done;
1851 } 1837 }
1852 } 1838 }
@@ -3807,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3807 } 3793 }
3808 3794
3809 /* delete the leaf if it is mostly empty */ 3795 /* delete the leaf if it is mostly empty */
3810 if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) { 3796 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
3811 /* push_leaf_left fixes the path. 3797 /* push_leaf_left fixes the path.
3812 * make sure the path still points to our leaf 3798 * make sure the path still points to our leaf
3813 * for possible call to del_ptr below 3799 * for possible call to del_ptr below
@@ -4042,10 +4028,9 @@ out:
4042 * calling this function. 4028 * calling this function.
4043 */ 4029 */
4044int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 4030int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4045 struct btrfs_key *key, int lowest_level, 4031 struct btrfs_key *key, int level,
4046 int cache_only, u64 min_trans) 4032 int cache_only, u64 min_trans)
4047{ 4033{
4048 int level = lowest_level;
4049 int slot; 4034 int slot;
4050 struct extent_buffer *c; 4035 struct extent_buffer *c;
4051 4036
@@ -4058,11 +4043,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4058 c = path->nodes[level]; 4043 c = path->nodes[level];
4059next: 4044next:
4060 if (slot >= btrfs_header_nritems(c)) { 4045 if (slot >= btrfs_header_nritems(c)) {
4061 level++; 4046 int ret;
4062 if (level == BTRFS_MAX_LEVEL) 4047 int orig_lowest;
4048 struct btrfs_key cur_key;
4049 if (level + 1 >= BTRFS_MAX_LEVEL ||
4050 !path->nodes[level + 1])
4063 return 1; 4051 return 1;
4064 continue; 4052
4053 if (path->locks[level + 1]) {
4054 level++;
4055 continue;
4056 }
4057
4058 slot = btrfs_header_nritems(c) - 1;
4059 if (level == 0)
4060 btrfs_item_key_to_cpu(c, &cur_key, slot);
4061 else
4062 btrfs_node_key_to_cpu(c, &cur_key, slot);
4063
4064 orig_lowest = path->lowest_level;
4065 btrfs_release_path(root, path);
4066 path->lowest_level = level;
4067 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4068 0, 0);
4069 path->lowest_level = orig_lowest;
4070 if (ret < 0)
4071 return ret;
4072
4073 c = path->nodes[level];
4074 slot = path->slots[level];
4075 if (ret == 0)
4076 slot++;
4077 goto next;
4065 } 4078 }
4079
4066 if (level == 0) 4080 if (level == 0)
4067 btrfs_item_key_to_cpu(c, key, slot); 4081 btrfs_item_key_to_cpu(c, key, slot);
4068 else { 4082 else {
@@ -4146,7 +4160,8 @@ again:
4146 * advance the path if there are now more items available. 4160 * advance the path if there are now more items available.
4147 */ 4161 */
4148 if (nritems > 0 && path->slots[0] < nritems - 1) { 4162 if (nritems > 0 && path->slots[0] < nritems - 1) {
4149 path->slots[0]++; 4163 if (ret == 0)
4164 path->slots[0]++;
4150 ret = 0; 4165 ret = 0;
4151 goto done; 4166 goto done;
4152 } 4167 }
@@ -4278,10 +4293,10 @@ int btrfs_previous_item(struct btrfs_root *root,
4278 path->slots[0]--; 4293 path->slots[0]--;
4279 4294
4280 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4295 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4281 if (found_key.type == type)
4282 return 0;
4283 if (found_key.objectid < min_objectid) 4296 if (found_key.objectid < min_objectid)
4284 break; 4297 break;
4298 if (found_key.type == type)
4299 return 0;
4285 if (found_key.objectid == min_objectid && 4300 if (found_key.objectid == min_objectid &&
4286 found_key.type < type) 4301 found_key.type < type)
4287 break; 4302 break;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2779c2f5360a..837435ce84ca 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -481,7 +481,7 @@ struct btrfs_shared_data_ref {
481 481
482struct btrfs_extent_inline_ref { 482struct btrfs_extent_inline_ref {
483 u8 type; 483 u8 type;
484 u64 offset; 484 __le64 offset;
485} __attribute__ ((__packed__)); 485} __attribute__ ((__packed__));
486 486
487/* old style backrefs item */ 487/* old style backrefs item */
@@ -689,6 +689,7 @@ struct btrfs_space_info {
689 struct list_head block_groups; 689 struct list_head block_groups;
690 spinlock_t lock; 690 spinlock_t lock;
691 struct rw_semaphore groups_sem; 691 struct rw_semaphore groups_sem;
692 atomic_t caching_threads;
692}; 693};
693 694
694/* 695/*
@@ -707,6 +708,9 @@ struct btrfs_free_cluster {
707 /* first extent starting offset */ 708 /* first extent starting offset */
708 u64 window_start; 709 u64 window_start;
709 710
711 /* if this cluster simply points at a bitmap in the block group */
712 bool points_to_bitmap;
713
710 struct btrfs_block_group_cache *block_group; 714 struct btrfs_block_group_cache *block_group;
711 /* 715 /*
712 * when a cluster is allocated from a block group, we put the 716 * when a cluster is allocated from a block group, we put the
@@ -716,24 +720,37 @@ struct btrfs_free_cluster {
716 struct list_head block_group_list; 720 struct list_head block_group_list;
717}; 721};
718 722
723enum btrfs_caching_type {
724 BTRFS_CACHE_NO = 0,
725 BTRFS_CACHE_STARTED = 1,
726 BTRFS_CACHE_FINISHED = 2,
727};
728
719struct btrfs_block_group_cache { 729struct btrfs_block_group_cache {
720 struct btrfs_key key; 730 struct btrfs_key key;
721 struct btrfs_block_group_item item; 731 struct btrfs_block_group_item item;
732 struct btrfs_fs_info *fs_info;
722 spinlock_t lock; 733 spinlock_t lock;
723 struct mutex cache_mutex;
724 u64 pinned; 734 u64 pinned;
725 u64 reserved; 735 u64 reserved;
726 u64 flags; 736 u64 flags;
727 int cached; 737 u64 sectorsize;
738 int extents_thresh;
739 int free_extents;
740 int total_bitmaps;
728 int ro; 741 int ro;
729 int dirty; 742 int dirty;
730 743
744 /* cache tracking stuff */
745 wait_queue_head_t caching_q;
746 int cached;
747
731 struct btrfs_space_info *space_info; 748 struct btrfs_space_info *space_info;
732 749
733 /* free space cache stuff */ 750 /* free space cache stuff */
734 spinlock_t tree_lock; 751 spinlock_t tree_lock;
735 struct rb_root free_space_bytes;
736 struct rb_root free_space_offset; 752 struct rb_root free_space_offset;
753 u64 free_space;
737 754
738 /* block group cache stuff */ 755 /* block group cache stuff */
739 struct rb_node cache_node; 756 struct rb_node cache_node;
@@ -808,6 +825,7 @@ struct btrfs_fs_info {
808 struct mutex drop_mutex; 825 struct mutex drop_mutex;
809 struct mutex volume_mutex; 826 struct mutex volume_mutex;
810 struct mutex tree_reloc_mutex; 827 struct mutex tree_reloc_mutex;
828 struct rw_semaphore extent_commit_sem;
811 829
812 /* 830 /*
813 * this protects the ordered operations list only while we are 831 * this protects the ordered operations list only while we are
@@ -1988,6 +2006,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
1988 u64 bytes); 2006 u64 bytes);
1989void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 2007void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
1990 u64 bytes); 2008 u64 bytes);
2009void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
1991/* ctree.c */ 2010/* ctree.c */
1992int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2011int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1993 int level, int *slot); 2012 int level, int *slot);
@@ -2074,8 +2093,7 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2074int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2093int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2075int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2094int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2076int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2095int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2077int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root 2096int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref);
2078 *root);
2079int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2097int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2080 struct btrfs_root *root, 2098 struct btrfs_root *root,
2081 struct extent_buffer *node, 2099 struct extent_buffer *node,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d28d29c95f7c..e83be2e4602c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1639,6 +1639,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1639 mutex_init(&fs_info->cleaner_mutex); 1639 mutex_init(&fs_info->cleaner_mutex);
1640 mutex_init(&fs_info->volume_mutex); 1640 mutex_init(&fs_info->volume_mutex);
1641 mutex_init(&fs_info->tree_reloc_mutex); 1641 mutex_init(&fs_info->tree_reloc_mutex);
1642 init_rwsem(&fs_info->extent_commit_sem);
1642 1643
1643 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 1644 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
1644 btrfs_init_free_cluster(&fs_info->data_alloc_cluster); 1645 btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1799,6 +1800,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1799 btrfs_super_chunk_root(disk_super), 1800 btrfs_super_chunk_root(disk_super),
1800 blocksize, generation); 1801 blocksize, generation);
1801 BUG_ON(!chunk_root->node); 1802 BUG_ON(!chunk_root->node);
1803 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
1804 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
1805 sb->s_id);
1806 goto fail_chunk_root;
1807 }
1802 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); 1808 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
1803 chunk_root->commit_root = btrfs_root_node(chunk_root); 1809 chunk_root->commit_root = btrfs_root_node(chunk_root);
1804 1810
@@ -1826,6 +1832,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1826 blocksize, generation); 1832 blocksize, generation);
1827 if (!tree_root->node) 1833 if (!tree_root->node)
1828 goto fail_chunk_root; 1834 goto fail_chunk_root;
1835 if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1836 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
1837 sb->s_id);
1838 goto fail_tree_root;
1839 }
1829 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 1840 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
1830 tree_root->commit_root = btrfs_root_node(tree_root); 1841 tree_root->commit_root = btrfs_root_node(tree_root);
1831 1842
@@ -2322,6 +2333,9 @@ int close_ctree(struct btrfs_root *root)
2322 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2333 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2323 } 2334 }
2324 2335
2336 fs_info->closing = 2;
2337 smp_mb();
2338
2325 if (fs_info->delalloc_bytes) { 2339 if (fs_info->delalloc_bytes) {
2326 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 2340 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
2327 (unsigned long long)fs_info->delalloc_bytes); 2341 (unsigned long long)fs_info->delalloc_bytes);
@@ -2343,6 +2357,7 @@ int close_ctree(struct btrfs_root *root)
2343 free_extent_buffer(root->fs_info->csum_root->commit_root); 2357 free_extent_buffer(root->fs_info->csum_root->commit_root);
2344 2358
2345 btrfs_free_block_groups(root->fs_info); 2359 btrfs_free_block_groups(root->fs_info);
2360 btrfs_free_pinned_extents(root->fs_info);
2346 2361
2347 del_fs_roots(fs_info); 2362 del_fs_roots(fs_info);
2348 2363
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index edc7d208c5ce..72a2b9c28e9f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -21,6 +21,7 @@
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/sort.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h>
24#include "compat.h" 25#include "compat.h"
25#include "hash.h" 26#include "hash.h"
26#include "ctree.h" 27#include "ctree.h"
@@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
61 struct btrfs_root *extent_root, u64 alloc_bytes, 62 struct btrfs_root *extent_root, u64 alloc_bytes,
62 u64 flags, int force); 63 u64 flags, int force);
63 64
65static noinline int
66block_group_cache_done(struct btrfs_block_group_cache *cache)
67{
68 smp_mb();
69 return cache->cached == BTRFS_CACHE_FINISHED;
70}
71
64static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 72static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65{ 73{
66 return (cache->flags & bits) == bits; 74 return (cache->flags & bits) == bits;
@@ -146,20 +154,70 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
146} 154}
147 155
148/* 156/*
157 * We always set EXTENT_LOCKED for the super mirror extents so we don't
158 * overwrite them, so those bits need to be unset. Also, if we are unmounting
159 * with pinned extents still sitting there because we had a block group caching,
160 * we need to clear those now, since we are done.
161 */
162void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
163{
164 u64 start, end, last = 0;
165 int ret;
166
167 while (1) {
168 ret = find_first_extent_bit(&info->pinned_extents, last,
169 &start, &end,
170 EXTENT_LOCKED|EXTENT_DIRTY);
171 if (ret)
172 break;
173
174 clear_extent_bits(&info->pinned_extents, start, end,
175 EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
176 last = end+1;
177 }
178}
179
180static int remove_sb_from_cache(struct btrfs_root *root,
181 struct btrfs_block_group_cache *cache)
182{
183 struct btrfs_fs_info *fs_info = root->fs_info;
184 u64 bytenr;
185 u64 *logical;
186 int stripe_len;
187 int i, nr, ret;
188
189 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
190 bytenr = btrfs_sb_offset(i);
191 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
192 cache->key.objectid, bytenr,
193 0, &logical, &nr, &stripe_len);
194 BUG_ON(ret);
195 while (nr--) {
196 try_lock_extent(&fs_info->pinned_extents,
197 logical[nr],
198 logical[nr] + stripe_len - 1, GFP_NOFS);
199 }
200 kfree(logical);
201 }
202
203 return 0;
204}
205
206/*
149 * this is only called by cache_block_group, since we could have freed extents 207 * this is only called by cache_block_group, since we could have freed extents
150 * we need to check the pinned_extents for any extents that can't be used yet 208 * we need to check the pinned_extents for any extents that can't be used yet
151 * since their free space will be released as soon as the transaction commits. 209 * since their free space will be released as soon as the transaction commits.
152 */ 210 */
153static int add_new_free_space(struct btrfs_block_group_cache *block_group, 211static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
154 struct btrfs_fs_info *info, u64 start, u64 end) 212 struct btrfs_fs_info *info, u64 start, u64 end)
155{ 213{
156 u64 extent_start, extent_end, size; 214 u64 extent_start, extent_end, size, total_added = 0;
157 int ret; 215 int ret;
158 216
159 while (start < end) { 217 while (start < end) {
160 ret = find_first_extent_bit(&info->pinned_extents, start, 218 ret = find_first_extent_bit(&info->pinned_extents, start,
161 &extent_start, &extent_end, 219 &extent_start, &extent_end,
162 EXTENT_DIRTY); 220 EXTENT_DIRTY|EXTENT_LOCKED);
163 if (ret) 221 if (ret)
164 break; 222 break;
165 223
@@ -167,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
167 start = extent_end + 1; 225 start = extent_end + 1;
168 } else if (extent_start > start && extent_start < end) { 226 } else if (extent_start > start && extent_start < end) {
169 size = extent_start - start; 227 size = extent_start - start;
228 total_added += size;
170 ret = btrfs_add_free_space(block_group, start, 229 ret = btrfs_add_free_space(block_group, start,
171 size); 230 size);
172 BUG_ON(ret); 231 BUG_ON(ret);
@@ -178,84 +237,93 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
178 237
179 if (start < end) { 238 if (start < end) {
180 size = end - start; 239 size = end - start;
240 total_added += size;
181 ret = btrfs_add_free_space(block_group, start, size); 241 ret = btrfs_add_free_space(block_group, start, size);
182 BUG_ON(ret); 242 BUG_ON(ret);
183 } 243 }
184 244
185 return 0; 245 return total_added;
186} 246}
187 247
188static int remove_sb_from_cache(struct btrfs_root *root, 248static int caching_kthread(void *data)
189 struct btrfs_block_group_cache *cache)
190{
191 u64 bytenr;
192 u64 *logical;
193 int stripe_len;
194 int i, nr, ret;
195
196 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
197 bytenr = btrfs_sb_offset(i);
198 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
199 cache->key.objectid, bytenr, 0,
200 &logical, &nr, &stripe_len);
201 BUG_ON(ret);
202 while (nr--) {
203 btrfs_remove_free_space(cache, logical[nr],
204 stripe_len);
205 }
206 kfree(logical);
207 }
208 return 0;
209}
210
211static int cache_block_group(struct btrfs_root *root,
212 struct btrfs_block_group_cache *block_group)
213{ 249{
250 struct btrfs_block_group_cache *block_group = data;
251 struct btrfs_fs_info *fs_info = block_group->fs_info;
252 u64 last = 0;
214 struct btrfs_path *path; 253 struct btrfs_path *path;
215 int ret = 0; 254 int ret = 0;
216 struct btrfs_key key; 255 struct btrfs_key key;
217 struct extent_buffer *leaf; 256 struct extent_buffer *leaf;
218 int slot; 257 int slot;
219 u64 last; 258 u64 total_found = 0;
220 259
221 if (!block_group) 260 BUG_ON(!fs_info);
222 return 0;
223
224 root = root->fs_info->extent_root;
225
226 if (block_group->cached)
227 return 0;
228 261
229 path = btrfs_alloc_path(); 262 path = btrfs_alloc_path();
230 if (!path) 263 if (!path)
231 return -ENOMEM; 264 return -ENOMEM;
232 265
233 path->reada = 2; 266 atomic_inc(&block_group->space_info->caching_threads);
267 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
234 /* 268 /*
235 * we get into deadlocks with paths held by callers of this function. 269 * We don't want to deadlock with somebody trying to allocate a new
236 * since the alloc_mutex is protecting things right now, just 270 * extent for the extent root while also trying to search the extent
237 * skip the locking here 271 * root to add free space. So we skip locking and search the commit
272 * root, since its read-only
238 */ 273 */
239 path->skip_locking = 1; 274 path->skip_locking = 1;
240 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 275 path->search_commit_root = 1;
276 path->reada = 2;
277
241 key.objectid = last; 278 key.objectid = last;
242 key.offset = 0; 279 key.offset = 0;
243 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 280 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
244 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 281again:
282 /* need to make sure the commit_root doesn't disappear */
283 down_read(&fs_info->extent_commit_sem);
284
285 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
245 if (ret < 0) 286 if (ret < 0)
246 goto err; 287 goto err;
247 288
248 while (1) { 289 while (1) {
290 smp_mb();
291 if (block_group->fs_info->closing > 1) {
292 last = (u64)-1;
293 break;
294 }
295
249 leaf = path->nodes[0]; 296 leaf = path->nodes[0];
250 slot = path->slots[0]; 297 slot = path->slots[0];
251 if (slot >= btrfs_header_nritems(leaf)) { 298 if (slot >= btrfs_header_nritems(leaf)) {
252 ret = btrfs_next_leaf(root, path); 299 ret = btrfs_next_leaf(fs_info->extent_root, path);
253 if (ret < 0) 300 if (ret < 0)
254 goto err; 301 goto err;
255 if (ret == 0) 302 else if (ret)
256 continue;
257 else
258 break; 303 break;
304
305 if (need_resched() ||
306 btrfs_transaction_in_commit(fs_info)) {
307 leaf = path->nodes[0];
308
309 /* this shouldn't happen, but if the
310 * leaf is empty just move on.
311 */
312 if (btrfs_header_nritems(leaf) == 0)
313 break;
314 /*
315 * we need to copy the key out so that
316 * we are sure the next search advances
317 * us forward in the btree.
318 */
319 btrfs_item_key_to_cpu(leaf, &key, 0);
320 btrfs_release_path(fs_info->extent_root, path);
321 up_read(&fs_info->extent_commit_sem);
322 schedule_timeout(1);
323 goto again;
324 }
325
326 continue;
259 } 327 }
260 btrfs_item_key_to_cpu(leaf, &key, slot); 328 btrfs_item_key_to_cpu(leaf, &key, slot);
261 if (key.objectid < block_group->key.objectid) 329 if (key.objectid < block_group->key.objectid)
@@ -266,24 +334,59 @@ static int cache_block_group(struct btrfs_root *root,
266 break; 334 break;
267 335
268 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { 336 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
269 add_new_free_space(block_group, root->fs_info, last, 337 total_found += add_new_free_space(block_group,
270 key.objectid); 338 fs_info, last,
271 339 key.objectid);
272 last = key.objectid + key.offset; 340 last = key.objectid + key.offset;
273 } 341 }
342
343 if (total_found > (1024 * 1024 * 2)) {
344 total_found = 0;
345 wake_up(&block_group->caching_q);
346 }
274next: 347next:
275 path->slots[0]++; 348 path->slots[0]++;
276 } 349 }
350 ret = 0;
277 351
278 add_new_free_space(block_group, root->fs_info, last, 352 total_found += add_new_free_space(block_group, fs_info, last,
279 block_group->key.objectid + 353 block_group->key.objectid +
280 block_group->key.offset); 354 block_group->key.offset);
355
356 spin_lock(&block_group->lock);
357 block_group->cached = BTRFS_CACHE_FINISHED;
358 spin_unlock(&block_group->lock);
281 359
282 block_group->cached = 1;
283 remove_sb_from_cache(root, block_group);
284 ret = 0;
285err: 360err:
286 btrfs_free_path(path); 361 btrfs_free_path(path);
362 up_read(&fs_info->extent_commit_sem);
363 atomic_dec(&block_group->space_info->caching_threads);
364 wake_up(&block_group->caching_q);
365
366 return 0;
367}
368
369static int cache_block_group(struct btrfs_block_group_cache *cache)
370{
371 struct task_struct *tsk;
372 int ret = 0;
373
374 spin_lock(&cache->lock);
375 if (cache->cached != BTRFS_CACHE_NO) {
376 spin_unlock(&cache->lock);
377 return ret;
378 }
379 cache->cached = BTRFS_CACHE_STARTED;
380 spin_unlock(&cache->lock);
381
382 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
383 cache->key.objectid);
384 if (IS_ERR(tsk)) {
385 ret = PTR_ERR(tsk);
386 printk(KERN_ERR "error running thread %d\n", ret);
387 BUG();
388 }
389
287 return ret; 390 return ret;
288} 391}
289 392
@@ -990,15 +1093,13 @@ static inline int extent_ref_type(u64 parent, u64 owner)
990 return type; 1093 return type;
991} 1094}
992 1095
993static int find_next_key(struct btrfs_path *path, struct btrfs_key *key) 1096static int find_next_key(struct btrfs_path *path, int level,
1097 struct btrfs_key *key)
994 1098
995{ 1099{
996 int level; 1100 for (; level < BTRFS_MAX_LEVEL; level++) {
997 BUG_ON(!path->keep_locks);
998 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
999 if (!path->nodes[level]) 1101 if (!path->nodes[level])
1000 break; 1102 break;
1001 btrfs_assert_tree_locked(path->nodes[level]);
1002 if (path->slots[level] + 1 >= 1103 if (path->slots[level] + 1 >=
1003 btrfs_header_nritems(path->nodes[level])) 1104 btrfs_header_nritems(path->nodes[level]))
1004 continue; 1105 continue;
@@ -1158,7 +1259,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1158 * For simplicity, we just do not add new inline back 1259 * For simplicity, we just do not add new inline back
1159 * ref if there is any kind of item for this block 1260 * ref if there is any kind of item for this block
1160 */ 1261 */
1161 if (find_next_key(path, &key) == 0 && key.objectid == bytenr && 1262 if (find_next_key(path, 0, &key) == 0 &&
1263 key.objectid == bytenr &&
1162 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1264 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1163 err = -EAGAIN; 1265 err = -EAGAIN;
1164 goto out; 1266 goto out;
@@ -2388,13 +2490,29 @@ fail:
2388 2490
2389} 2491}
2390 2492
2493static struct btrfs_block_group_cache *
2494next_block_group(struct btrfs_root *root,
2495 struct btrfs_block_group_cache *cache)
2496{
2497 struct rb_node *node;
2498 spin_lock(&root->fs_info->block_group_cache_lock);
2499 node = rb_next(&cache->cache_node);
2500 btrfs_put_block_group(cache);
2501 if (node) {
2502 cache = rb_entry(node, struct btrfs_block_group_cache,
2503 cache_node);
2504 atomic_inc(&cache->count);
2505 } else
2506 cache = NULL;
2507 spin_unlock(&root->fs_info->block_group_cache_lock);
2508 return cache;
2509}
2510
2391int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2511int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2392 struct btrfs_root *root) 2512 struct btrfs_root *root)
2393{ 2513{
2394 struct btrfs_block_group_cache *cache, *entry; 2514 struct btrfs_block_group_cache *cache;
2395 struct rb_node *n;
2396 int err = 0; 2515 int err = 0;
2397 int werr = 0;
2398 struct btrfs_path *path; 2516 struct btrfs_path *path;
2399 u64 last = 0; 2517 u64 last = 0;
2400 2518
@@ -2403,39 +2521,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2403 return -ENOMEM; 2521 return -ENOMEM;
2404 2522
2405 while (1) { 2523 while (1) {
2406 cache = NULL; 2524 if (last == 0) {
2407 spin_lock(&root->fs_info->block_group_cache_lock); 2525 err = btrfs_run_delayed_refs(trans, root,
2408 for (n = rb_first(&root->fs_info->block_group_cache_tree); 2526 (unsigned long)-1);
2409 n; n = rb_next(n)) { 2527 BUG_ON(err);
2410 entry = rb_entry(n, struct btrfs_block_group_cache,
2411 cache_node);
2412 if (entry->dirty) {
2413 cache = entry;
2414 break;
2415 }
2416 } 2528 }
2417 spin_unlock(&root->fs_info->block_group_cache_lock);
2418 2529
2419 if (!cache) 2530 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2420 break; 2531 while (cache) {
2532 if (cache->dirty)
2533 break;
2534 cache = next_block_group(root, cache);
2535 }
2536 if (!cache) {
2537 if (last == 0)
2538 break;
2539 last = 0;
2540 continue;
2541 }
2421 2542
2422 cache->dirty = 0; 2543 cache->dirty = 0;
2423 last += cache->key.offset; 2544 last = cache->key.objectid + cache->key.offset;
2424 2545
2425 err = write_one_cache_group(trans, root, 2546 err = write_one_cache_group(trans, root, path, cache);
2426 path, cache); 2547 BUG_ON(err);
2427 /* 2548 btrfs_put_block_group(cache);
2428 * if we fail to write the cache group, we want
2429 * to keep it marked dirty in hopes that a later
2430 * write will work
2431 */
2432 if (err) {
2433 werr = err;
2434 continue;
2435 }
2436 } 2549 }
2550
2437 btrfs_free_path(path); 2551 btrfs_free_path(path);
2438 return werr; 2552 return 0;
2439} 2553}
2440 2554
2441int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 2555int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -2485,6 +2599,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2485 found->force_alloc = 0; 2599 found->force_alloc = 0;
2486 *space_info = found; 2600 *space_info = found;
2487 list_add_rcu(&found->list, &info->space_info); 2601 list_add_rcu(&found->list, &info->space_info);
2602 atomic_set(&found->caching_threads, 0);
2488 return 0; 2603 return 0;
2489} 2604}
2490 2605
@@ -2697,7 +2812,7 @@ again:
2697 2812
2698 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 2813 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
2699 ", %llu bytes_used, %llu bytes_reserved, " 2814 ", %llu bytes_used, %llu bytes_reserved, "
2700 "%llu bytes_pinned, %llu bytes_readonly, %llu may use" 2815 "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
2701 "%llu total\n", (unsigned long long)bytes, 2816 "%llu total\n", (unsigned long long)bytes,
2702 (unsigned long long)data_sinfo->bytes_delalloc, 2817 (unsigned long long)data_sinfo->bytes_delalloc,
2703 (unsigned long long)data_sinfo->bytes_used, 2818 (unsigned long long)data_sinfo->bytes_used,
@@ -2948,13 +3063,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2948 struct btrfs_block_group_cache *cache; 3063 struct btrfs_block_group_cache *cache;
2949 struct btrfs_fs_info *fs_info = root->fs_info; 3064 struct btrfs_fs_info *fs_info = root->fs_info;
2950 3065
2951 if (pin) { 3066 if (pin)
2952 set_extent_dirty(&fs_info->pinned_extents, 3067 set_extent_dirty(&fs_info->pinned_extents,
2953 bytenr, bytenr + num - 1, GFP_NOFS); 3068 bytenr, bytenr + num - 1, GFP_NOFS);
2954 } else {
2955 clear_extent_dirty(&fs_info->pinned_extents,
2956 bytenr, bytenr + num - 1, GFP_NOFS);
2957 }
2958 3069
2959 while (num > 0) { 3070 while (num > 0) {
2960 cache = btrfs_lookup_block_group(fs_info, bytenr); 3071 cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2970,14 +3081,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2970 spin_unlock(&cache->space_info->lock); 3081 spin_unlock(&cache->space_info->lock);
2971 fs_info->total_pinned += len; 3082 fs_info->total_pinned += len;
2972 } else { 3083 } else {
3084 int unpin = 0;
3085
3086 /*
3087 * in order to not race with the block group caching, we
3088 * only want to unpin the extent if we are cached. If
3089 * we aren't cached, we want to start async caching this
3090 * block group so we can free the extent the next time
3091 * around.
3092 */
2973 spin_lock(&cache->space_info->lock); 3093 spin_lock(&cache->space_info->lock);
2974 spin_lock(&cache->lock); 3094 spin_lock(&cache->lock);
2975 cache->pinned -= len; 3095 unpin = (cache->cached == BTRFS_CACHE_FINISHED);
2976 cache->space_info->bytes_pinned -= len; 3096 if (likely(unpin)) {
3097 cache->pinned -= len;
3098 cache->space_info->bytes_pinned -= len;
3099 fs_info->total_pinned -= len;
3100 }
2977 spin_unlock(&cache->lock); 3101 spin_unlock(&cache->lock);
2978 spin_unlock(&cache->space_info->lock); 3102 spin_unlock(&cache->space_info->lock);
2979 fs_info->total_pinned -= len; 3103
2980 if (cache->cached) 3104 if (likely(unpin))
3105 clear_extent_dirty(&fs_info->pinned_extents,
3106 bytenr, bytenr + len -1,
3107 GFP_NOFS);
3108 else
3109 cache_block_group(cache);
3110
3111 if (unpin)
2981 btrfs_add_free_space(cache, bytenr, len); 3112 btrfs_add_free_space(cache, bytenr, len);
2982 } 3113 }
2983 btrfs_put_block_group(cache); 3114 btrfs_put_block_group(cache);
@@ -3031,6 +3162,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
3031 &start, &end, EXTENT_DIRTY); 3162 &start, &end, EXTENT_DIRTY);
3032 if (ret) 3163 if (ret)
3033 break; 3164 break;
3165
3034 set_extent_dirty(copy, start, end, GFP_NOFS); 3166 set_extent_dirty(copy, start, end, GFP_NOFS);
3035 last = end + 1; 3167 last = end + 1;
3036 } 3168 }
@@ -3059,6 +3191,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3059 3191
3060 cond_resched(); 3192 cond_resched();
3061 } 3193 }
3194
3062 return ret; 3195 return ret;
3063} 3196}
3064 3197
@@ -3437,6 +3570,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
3437} 3570}
3438 3571
3439/* 3572/*
3573 * when we wait for progress in the block group caching, its because
3574 * our allocation attempt failed at least once. So, we must sleep
3575 * and let some progress happen before we try again.
3576 *
3577 * This function will sleep at least once waiting for new free space to
3578 * show up, and then it will check the block group free space numbers
3579 * for our min num_bytes. Another option is to have it go ahead
3580 * and look in the rbtree for a free extent of a given size, but this
3581 * is a good start.
3582 */
3583static noinline int
3584wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
3585 u64 num_bytes)
3586{
3587 DEFINE_WAIT(wait);
3588
3589 prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
3590
3591 if (block_group_cache_done(cache)) {
3592 finish_wait(&cache->caching_q, &wait);
3593 return 0;
3594 }
3595 schedule();
3596 finish_wait(&cache->caching_q, &wait);
3597
3598 wait_event(cache->caching_q, block_group_cache_done(cache) ||
3599 (cache->free_space >= num_bytes));
3600 return 0;
3601}
3602
3603enum btrfs_loop_type {
3604 LOOP_CACHED_ONLY = 0,
3605 LOOP_CACHING_NOWAIT = 1,
3606 LOOP_CACHING_WAIT = 2,
3607 LOOP_ALLOC_CHUNK = 3,
3608 LOOP_NO_EMPTY_SIZE = 4,
3609};
3610
3611/*
3440 * walks the btree of allocated extents and find a hole of a given size. 3612 * walks the btree of allocated extents and find a hole of a given size.
3441 * The key ins is changed to record the hole: 3613 * The key ins is changed to record the hole:
3442 * ins->objectid == block start 3614 * ins->objectid == block start
@@ -3461,6 +3633,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3461 struct btrfs_space_info *space_info; 3633 struct btrfs_space_info *space_info;
3462 int last_ptr_loop = 0; 3634 int last_ptr_loop = 0;
3463 int loop = 0; 3635 int loop = 0;
3636 bool found_uncached_bg = false;
3464 3637
3465 WARN_ON(num_bytes < root->sectorsize); 3638 WARN_ON(num_bytes < root->sectorsize);
3466 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 3639 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3492,15 +3665,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3492 search_start = max(search_start, first_logical_byte(root, 0)); 3665 search_start = max(search_start, first_logical_byte(root, 0));
3493 search_start = max(search_start, hint_byte); 3666 search_start = max(search_start, hint_byte);
3494 3667
3495 if (!last_ptr) { 3668 if (!last_ptr)
3496 empty_cluster = 0; 3669 empty_cluster = 0;
3497 loop = 1;
3498 }
3499 3670
3500 if (search_start == hint_byte) { 3671 if (search_start == hint_byte) {
3501 block_group = btrfs_lookup_block_group(root->fs_info, 3672 block_group = btrfs_lookup_block_group(root->fs_info,
3502 search_start); 3673 search_start);
3503 if (block_group && block_group_bits(block_group, data)) { 3674 /*
3675 * we don't want to use the block group if it doesn't match our
3676 * allocation bits, or if its not cached.
3677 */
3678 if (block_group && block_group_bits(block_group, data) &&
3679 block_group_cache_done(block_group)) {
3504 down_read(&space_info->groups_sem); 3680 down_read(&space_info->groups_sem);
3505 if (list_empty(&block_group->list) || 3681 if (list_empty(&block_group->list) ||
3506 block_group->ro) { 3682 block_group->ro) {
@@ -3523,21 +3699,35 @@ search:
3523 down_read(&space_info->groups_sem); 3699 down_read(&space_info->groups_sem);
3524 list_for_each_entry(block_group, &space_info->block_groups, list) { 3700 list_for_each_entry(block_group, &space_info->block_groups, list) {
3525 u64 offset; 3701 u64 offset;
3702 int cached;
3526 3703
3527 atomic_inc(&block_group->count); 3704 atomic_inc(&block_group->count);
3528 search_start = block_group->key.objectid; 3705 search_start = block_group->key.objectid;
3529 3706
3530have_block_group: 3707have_block_group:
3531 if (unlikely(!block_group->cached)) { 3708 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
3532 mutex_lock(&block_group->cache_mutex); 3709 /*
3533 ret = cache_block_group(root, block_group); 3710 * we want to start caching kthreads, but not too many
3534 mutex_unlock(&block_group->cache_mutex); 3711 * right off the bat so we don't overwhelm the system,
3535 if (ret) { 3712 * so only start them if there are less than 2 and we're
3536 btrfs_put_block_group(block_group); 3713 * in the initial allocation phase.
3537 break; 3714 */
3715 if (loop > LOOP_CACHING_NOWAIT ||
3716 atomic_read(&space_info->caching_threads) < 2) {
3717 ret = cache_block_group(block_group);
3718 BUG_ON(ret);
3538 } 3719 }
3539 } 3720 }
3540 3721
3722 cached = block_group_cache_done(block_group);
3723 if (unlikely(!cached)) {
3724 found_uncached_bg = true;
3725
3726 /* if we only want cached bgs, loop */
3727 if (loop == LOOP_CACHED_ONLY)
3728 goto loop;
3729 }
3730
3541 if (unlikely(block_group->ro)) 3731 if (unlikely(block_group->ro))
3542 goto loop; 3732 goto loop;
3543 3733
@@ -3616,14 +3806,21 @@ refill_cluster:
3616 spin_unlock(&last_ptr->refill_lock); 3806 spin_unlock(&last_ptr->refill_lock);
3617 goto checks; 3807 goto checks;
3618 } 3808 }
3809 } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
3810 spin_unlock(&last_ptr->refill_lock);
3811
3812 wait_block_group_cache_progress(block_group,
3813 num_bytes + empty_cluster + empty_size);
3814 goto have_block_group;
3619 } 3815 }
3816
3620 /* 3817 /*
3621 * at this point we either didn't find a cluster 3818 * at this point we either didn't find a cluster
3622 * or we weren't able to allocate a block from our 3819 * or we weren't able to allocate a block from our
3623 * cluster. Free the cluster we've been trying 3820 * cluster. Free the cluster we've been trying
3624 * to use, and go to the next block group 3821 * to use, and go to the next block group
3625 */ 3822 */
3626 if (loop < 2) { 3823 if (loop < LOOP_NO_EMPTY_SIZE) {
3627 btrfs_return_cluster_to_free_space(NULL, 3824 btrfs_return_cluster_to_free_space(NULL,
3628 last_ptr); 3825 last_ptr);
3629 spin_unlock(&last_ptr->refill_lock); 3826 spin_unlock(&last_ptr->refill_lock);
@@ -3634,11 +3831,17 @@ refill_cluster:
3634 3831
3635 offset = btrfs_find_space_for_alloc(block_group, search_start, 3832 offset = btrfs_find_space_for_alloc(block_group, search_start,
3636 num_bytes, empty_size); 3833 num_bytes, empty_size);
3637 if (!offset) 3834 if (!offset && (cached || (!cached &&
3835 loop == LOOP_CACHING_NOWAIT))) {
3638 goto loop; 3836 goto loop;
3837 } else if (!offset && (!cached &&
3838 loop > LOOP_CACHING_NOWAIT)) {
3839 wait_block_group_cache_progress(block_group,
3840 num_bytes + empty_size);
3841 goto have_block_group;
3842 }
3639checks: 3843checks:
3640 search_start = stripe_align(root, offset); 3844 search_start = stripe_align(root, offset);
3641
3642 /* move on to the next group */ 3845 /* move on to the next group */
3643 if (search_start + num_bytes >= search_end) { 3846 if (search_start + num_bytes >= search_end) {
3644 btrfs_add_free_space(block_group, offset, num_bytes); 3847 btrfs_add_free_space(block_group, offset, num_bytes);
@@ -3684,13 +3887,26 @@ loop:
3684 } 3887 }
3685 up_read(&space_info->groups_sem); 3888 up_read(&space_info->groups_sem);
3686 3889
3687 /* loop == 0, try to find a clustered alloc in every block group 3890 /* LOOP_CACHED_ONLY, only search fully cached block groups
3688 * loop == 1, try again after forcing a chunk allocation 3891 * LOOP_CACHING_NOWAIT, search partially cached block groups, but
3689 * loop == 2, set empty_size and empty_cluster to 0 and try again 3892 * dont wait for them to finish caching
3893 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3894 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3895 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
3896 * again
3690 */ 3897 */
3691 if (!ins->objectid && loop < 3 && 3898 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
3692 (empty_size || empty_cluster || allowed_chunk_alloc)) { 3899 (found_uncached_bg || empty_size || empty_cluster ||
3693 if (loop >= 2) { 3900 allowed_chunk_alloc)) {
3901 if (found_uncached_bg) {
3902 found_uncached_bg = false;
3903 if (loop < LOOP_CACHING_WAIT) {
3904 loop++;
3905 goto search;
3906 }
3907 }
3908
3909 if (loop == LOOP_ALLOC_CHUNK) {
3694 empty_size = 0; 3910 empty_size = 0;
3695 empty_cluster = 0; 3911 empty_cluster = 0;
3696 } 3912 }
@@ -3703,7 +3919,7 @@ loop:
3703 space_info->force_alloc = 1; 3919 space_info->force_alloc = 1;
3704 } 3920 }
3705 3921
3706 if (loop < 3) { 3922 if (loop < LOOP_NO_EMPTY_SIZE) {
3707 loop++; 3923 loop++;
3708 goto search; 3924 goto search;
3709 } 3925 }
@@ -3799,7 +4015,7 @@ again:
3799 num_bytes, data, 1); 4015 num_bytes, data, 1);
3800 goto again; 4016 goto again;
3801 } 4017 }
3802 if (ret) { 4018 if (ret == -ENOSPC) {
3803 struct btrfs_space_info *sinfo; 4019 struct btrfs_space_info *sinfo;
3804 4020
3805 sinfo = __find_space_info(root->fs_info, data); 4021 sinfo = __find_space_info(root->fs_info, data);
@@ -3807,7 +4023,6 @@ again:
3807 "wanted %llu\n", (unsigned long long)data, 4023 "wanted %llu\n", (unsigned long long)data,
3808 (unsigned long long)num_bytes); 4024 (unsigned long long)num_bytes);
3809 dump_space_info(sinfo, num_bytes); 4025 dump_space_info(sinfo, num_bytes);
3810 BUG();
3811 } 4026 }
3812 4027
3813 return ret; 4028 return ret;
@@ -3845,7 +4060,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3845 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, 4060 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3846 empty_size, hint_byte, search_end, ins, 4061 empty_size, hint_byte, search_end, ins,
3847 data); 4062 data);
3848 update_reserved_extents(root, ins->objectid, ins->offset, 1); 4063 if (!ret)
4064 update_reserved_extents(root, ins->objectid, ins->offset, 1);
4065
3849 return ret; 4066 return ret;
3850} 4067}
3851 4068
@@ -4007,9 +4224,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4007 struct btrfs_block_group_cache *block_group; 4224 struct btrfs_block_group_cache *block_group;
4008 4225
4009 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 4226 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
4010 mutex_lock(&block_group->cache_mutex); 4227 cache_block_group(block_group);
4011 cache_block_group(root, block_group); 4228 wait_event(block_group->caching_q,
4012 mutex_unlock(&block_group->cache_mutex); 4229 block_group_cache_done(block_group));
4013 4230
4014 ret = btrfs_remove_free_space(block_group, ins->objectid, 4231 ret = btrfs_remove_free_space(block_group, ins->objectid,
4015 ins->offset); 4232 ins->offset);
@@ -4040,7 +4257,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
4040 ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, 4257 ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4041 empty_size, hint_byte, search_end, 4258 empty_size, hint_byte, search_end,
4042 ins, 0); 4259 ins, 0);
4043 BUG_ON(ret); 4260 if (ret)
4261 return ret;
4044 4262
4045 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 4263 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4046 if (parent == 0) 4264 if (parent == 0)
@@ -4128,6 +4346,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4128 return buf; 4346 return buf;
4129} 4347}
4130 4348
4349#if 0
4131int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 4350int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
4132 struct btrfs_root *root, struct extent_buffer *leaf) 4351 struct btrfs_root *root, struct extent_buffer *leaf)
4133{ 4352{
@@ -4171,8 +4390,6 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
4171 return 0; 4390 return 0;
4172} 4391}
4173 4392
4174#if 0
4175
4176static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, 4393static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
4177 struct btrfs_root *root, 4394 struct btrfs_root *root,
4178 struct btrfs_leaf_ref *ref) 4395 struct btrfs_leaf_ref *ref)
@@ -4553,262 +4770,471 @@ out:
4553} 4770}
4554#endif 4771#endif
4555 4772
4773struct walk_control {
4774 u64 refs[BTRFS_MAX_LEVEL];
4775 u64 flags[BTRFS_MAX_LEVEL];
4776 struct btrfs_key update_progress;
4777 int stage;
4778 int level;
4779 int shared_level;
4780 int update_ref;
4781 int keep_locks;
4782};
4783
4784#define DROP_REFERENCE 1
4785#define UPDATE_BACKREF 2
4786
4556/* 4787/*
4557 * helper function for drop_subtree, this function is similar to 4788 * hepler to process tree block while walking down the tree.
4558 * walk_down_tree. The main difference is that it checks reference 4789 *
4559 * counts while tree blocks are locked. 4790 * when wc->stage == DROP_REFERENCE, this function checks
4791 * reference count of the block. if the block is shared and
4792 * we need update back refs for the subtree rooted at the
4793 * block, this function changes wc->stage to UPDATE_BACKREF
4794 *
4795 * when wc->stage == UPDATE_BACKREF, this function updates
4796 * back refs for pointers in the block.
4797 *
4798 * NOTE: return value 1 means we should stop walking down.
4560 */ 4799 */
4561static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 4800static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4562 struct btrfs_root *root, 4801 struct btrfs_root *root,
4563 struct btrfs_path *path, int *level) 4802 struct btrfs_path *path,
4803 struct walk_control *wc)
4564{ 4804{
4565 struct extent_buffer *next; 4805 int level = wc->level;
4566 struct extent_buffer *cur; 4806 struct extent_buffer *eb = path->nodes[level];
4567 struct extent_buffer *parent; 4807 struct btrfs_key key;
4568 u64 bytenr; 4808 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
4569 u64 ptr_gen;
4570 u64 refs;
4571 u64 flags;
4572 u32 blocksize;
4573 int ret; 4809 int ret;
4574 4810
4575 cur = path->nodes[*level]; 4811 if (wc->stage == UPDATE_BACKREF &&
4576 ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len, 4812 btrfs_header_owner(eb) != root->root_key.objectid)
4577 &refs, &flags); 4813 return 1;
4578 BUG_ON(ret);
4579 if (refs > 1)
4580 goto out;
4581 4814
4582 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 4815 /*
4816 * when reference count of tree block is 1, it won't increase
4817 * again. once full backref flag is set, we never clear it.
4818 */
4819 if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
4820 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
4821 BUG_ON(!path->locks[level]);
4822 ret = btrfs_lookup_extent_info(trans, root,
4823 eb->start, eb->len,
4824 &wc->refs[level],
4825 &wc->flags[level]);
4826 BUG_ON(ret);
4827 BUG_ON(wc->refs[level] == 0);
4828 }
4583 4829
4584 while (*level >= 0) { 4830 if (wc->stage == DROP_REFERENCE &&
4585 cur = path->nodes[*level]; 4831 wc->update_ref && wc->refs[level] > 1) {
4586 if (*level == 0) { 4832 BUG_ON(eb == root->node);
4587 ret = btrfs_drop_leaf_ref(trans, root, cur); 4833 BUG_ON(path->slots[level] > 0);
4588 BUG_ON(ret); 4834 if (level == 0)
4589 clean_tree_block(trans, root, cur); 4835 btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
4590 break; 4836 else
4591 } 4837 btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
4592 if (path->slots[*level] >= btrfs_header_nritems(cur)) { 4838 if (btrfs_header_owner(eb) == root->root_key.objectid &&
4593 clean_tree_block(trans, root, cur); 4839 btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
4594 break; 4840 wc->stage = UPDATE_BACKREF;
4841 wc->shared_level = level;
4595 } 4842 }
4843 }
4596 4844
4597 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 4845 if (wc->stage == DROP_REFERENCE) {
4598 blocksize = btrfs_level_size(root, *level - 1); 4846 if (wc->refs[level] > 1)
4599 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 4847 return 1;
4600 4848
4601 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 4849 if (path->locks[level] && !wc->keep_locks) {
4602 btrfs_tree_lock(next); 4850 btrfs_tree_unlock(eb);
4603 btrfs_set_lock_blocking(next); 4851 path->locks[level] = 0;
4852 }
4853 return 0;
4854 }
4604 4855
4605 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, 4856 /* wc->stage == UPDATE_BACKREF */
4606 &refs, &flags); 4857 if (!(wc->flags[level] & flag)) {
4858 BUG_ON(!path->locks[level]);
4859 ret = btrfs_inc_ref(trans, root, eb, 1);
4607 BUG_ON(ret); 4860 BUG_ON(ret);
4608 if (refs > 1) { 4861 ret = btrfs_dec_ref(trans, root, eb, 0);
4609 parent = path->nodes[*level]; 4862 BUG_ON(ret);
4610 ret = btrfs_free_extent(trans, root, bytenr, 4863 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
4611 blocksize, parent->start, 4864 eb->len, flag, 0);
4612 btrfs_header_owner(parent), 4865 BUG_ON(ret);
4613 *level - 1, 0); 4866 wc->flags[level] |= flag;
4867 }
4868
4869 /*
4870 * the block is shared by multiple trees, so it's not good to
4871 * keep the tree lock
4872 */
4873 if (path->locks[level] && level > 0) {
4874 btrfs_tree_unlock(eb);
4875 path->locks[level] = 0;
4876 }
4877 return 0;
4878}
4879
4880/*
4881 * hepler to process tree block while walking up the tree.
4882 *
4883 * when wc->stage == DROP_REFERENCE, this function drops
4884 * reference count on the block.
4885 *
4886 * when wc->stage == UPDATE_BACKREF, this function changes
4887 * wc->stage back to DROP_REFERENCE if we changed wc->stage
4888 * to UPDATE_BACKREF previously while processing the block.
4889 *
4890 * NOTE: return value 1 means we should stop walking up.
4891 */
4892static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
4893 struct btrfs_root *root,
4894 struct btrfs_path *path,
4895 struct walk_control *wc)
4896{
4897 int ret = 0;
4898 int level = wc->level;
4899 struct extent_buffer *eb = path->nodes[level];
4900 u64 parent = 0;
4901
4902 if (wc->stage == UPDATE_BACKREF) {
4903 BUG_ON(wc->shared_level < level);
4904 if (level < wc->shared_level)
4905 goto out;
4906
4907 BUG_ON(wc->refs[level] <= 1);
4908 ret = find_next_key(path, level + 1, &wc->update_progress);
4909 if (ret > 0)
4910 wc->update_ref = 0;
4911
4912 wc->stage = DROP_REFERENCE;
4913 wc->shared_level = -1;
4914 path->slots[level] = 0;
4915
4916 /*
4917 * check reference count again if the block isn't locked.
4918 * we should start walking down the tree again if reference
4919 * count is one.
4920 */
4921 if (!path->locks[level]) {
4922 BUG_ON(level == 0);
4923 btrfs_tree_lock(eb);
4924 btrfs_set_lock_blocking(eb);
4925 path->locks[level] = 1;
4926
4927 ret = btrfs_lookup_extent_info(trans, root,
4928 eb->start, eb->len,
4929 &wc->refs[level],
4930 &wc->flags[level]);
4614 BUG_ON(ret); 4931 BUG_ON(ret);
4615 path->slots[*level]++; 4932 BUG_ON(wc->refs[level] == 0);
4616 btrfs_tree_unlock(next); 4933 if (wc->refs[level] == 1) {
4617 free_extent_buffer(next); 4934 btrfs_tree_unlock(eb);
4618 continue; 4935 path->locks[level] = 0;
4936 return 1;
4937 }
4938 } else {
4939 BUG_ON(level != 0);
4619 } 4940 }
4941 }
4620 4942
4621 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 4943 /* wc->stage == DROP_REFERENCE */
4944 BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
4622 4945
4623 *level = btrfs_header_level(next); 4946 if (wc->refs[level] == 1) {
4624 path->nodes[*level] = next; 4947 if (level == 0) {
4625 path->slots[*level] = 0; 4948 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
4626 path->locks[*level] = 1; 4949 ret = btrfs_dec_ref(trans, root, eb, 1);
4627 cond_resched(); 4950 else
4951 ret = btrfs_dec_ref(trans, root, eb, 0);
4952 BUG_ON(ret);
4953 }
4954 /* make block locked assertion in clean_tree_block happy */
4955 if (!path->locks[level] &&
4956 btrfs_header_generation(eb) == trans->transid) {
4957 btrfs_tree_lock(eb);
4958 btrfs_set_lock_blocking(eb);
4959 path->locks[level] = 1;
4960 }
4961 clean_tree_block(trans, root, eb);
4628 } 4962 }
4629out:
4630 if (path->nodes[*level] == root->node)
4631 parent = path->nodes[*level];
4632 else
4633 parent = path->nodes[*level + 1];
4634 bytenr = path->nodes[*level]->start;
4635 blocksize = path->nodes[*level]->len;
4636 4963
4637 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, 4964 if (eb == root->node) {
4638 btrfs_header_owner(parent), *level, 0); 4965 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
4966 parent = eb->start;
4967 else
4968 BUG_ON(root->root_key.objectid !=
4969 btrfs_header_owner(eb));
4970 } else {
4971 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
4972 parent = path->nodes[level + 1]->start;
4973 else
4974 BUG_ON(root->root_key.objectid !=
4975 btrfs_header_owner(path->nodes[level + 1]));
4976 }
4977
4978 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
4979 root->root_key.objectid, level, 0);
4639 BUG_ON(ret); 4980 BUG_ON(ret);
4981out:
4982 wc->refs[level] = 0;
4983 wc->flags[level] = 0;
4984 return ret;
4985}
4986
4987static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4988 struct btrfs_root *root,
4989 struct btrfs_path *path,
4990 struct walk_control *wc)
4991{
4992 struct extent_buffer *next;
4993 struct extent_buffer *cur;
4994 u64 bytenr;
4995 u64 ptr_gen;
4996 u32 blocksize;
4997 int level = wc->level;
4998 int ret;
4999
5000 while (level >= 0) {
5001 cur = path->nodes[level];
5002 BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
5003
5004 ret = walk_down_proc(trans, root, path, wc);
5005 if (ret > 0)
5006 break;
5007
5008 if (level == 0)
5009 break;
5010
5011 bytenr = btrfs_node_blockptr(cur, path->slots[level]);
5012 blocksize = btrfs_level_size(root, level - 1);
5013 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
5014
5015 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
5016 btrfs_tree_lock(next);
5017 btrfs_set_lock_blocking(next);
4640 5018
4641 if (path->locks[*level]) { 5019 level--;
4642 btrfs_tree_unlock(path->nodes[*level]); 5020 BUG_ON(level != btrfs_header_level(next));
4643 path->locks[*level] = 0; 5021 path->nodes[level] = next;
5022 path->slots[level] = 0;
5023 path->locks[level] = 1;
5024 wc->level = level;
4644 } 5025 }
4645 free_extent_buffer(path->nodes[*level]);
4646 path->nodes[*level] = NULL;
4647 *level += 1;
4648 cond_resched();
4649 return 0; 5026 return 0;
4650} 5027}
4651 5028
4652/*
4653 * helper for dropping snapshots. This walks back up the tree in the path
4654 * to find the first node higher up where we haven't yet gone through
4655 * all the slots
4656 */
4657static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 5029static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
4658 struct btrfs_root *root, 5030 struct btrfs_root *root,
4659 struct btrfs_path *path, 5031 struct btrfs_path *path,
4660 int *level, int max_level) 5032 struct walk_control *wc, int max_level)
4661{ 5033{
4662 struct btrfs_root_item *root_item = &root->root_item; 5034 int level = wc->level;
4663 int i;
4664 int slot;
4665 int ret; 5035 int ret;
4666 5036
4667 for (i = *level; i < max_level && path->nodes[i]; i++) { 5037 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
4668 slot = path->slots[i]; 5038 while (level < max_level && path->nodes[level]) {
4669 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 5039 wc->level = level;
4670 /* 5040 if (path->slots[level] + 1 <
4671 * there is more work to do in this level. 5041 btrfs_header_nritems(path->nodes[level])) {
4672 * Update the drop_progress marker to reflect 5042 path->slots[level]++;
4673 * the work we've done so far, and then bump
4674 * the slot number
4675 */
4676 path->slots[i]++;
4677 WARN_ON(*level == 0);
4678 if (max_level == BTRFS_MAX_LEVEL) {
4679 btrfs_node_key(path->nodes[i],
4680 &root_item->drop_progress,
4681 path->slots[i]);
4682 root_item->drop_level = i;
4683 }
4684 *level = i;
4685 return 0; 5043 return 0;
4686 } else { 5044 } else {
4687 struct extent_buffer *parent; 5045 ret = walk_up_proc(trans, root, path, wc);
4688 5046 if (ret > 0)
4689 /* 5047 return 0;
4690 * this whole node is done, free our reference
4691 * on it and go up one level
4692 */
4693 if (path->nodes[*level] == root->node)
4694 parent = path->nodes[*level];
4695 else
4696 parent = path->nodes[*level + 1];
4697 5048
4698 clean_tree_block(trans, root, path->nodes[i]); 5049 if (path->locks[level]) {
4699 ret = btrfs_free_extent(trans, root, 5050 btrfs_tree_unlock(path->nodes[level]);
4700 path->nodes[i]->start, 5051 path->locks[level] = 0;
4701 path->nodes[i]->len,
4702 parent->start,
4703 btrfs_header_owner(parent),
4704 *level, 0);
4705 BUG_ON(ret);
4706 if (path->locks[*level]) {
4707 btrfs_tree_unlock(path->nodes[i]);
4708 path->locks[i] = 0;
4709 } 5052 }
4710 free_extent_buffer(path->nodes[i]); 5053 free_extent_buffer(path->nodes[level]);
4711 path->nodes[i] = NULL; 5054 path->nodes[level] = NULL;
4712 *level = i + 1; 5055 level++;
4713 } 5056 }
4714 } 5057 }
4715 return 1; 5058 return 1;
4716} 5059}
4717 5060
4718/* 5061/*
4719 * drop the reference count on the tree rooted at 'snap'. This traverses 5062 * drop a subvolume tree.
4720 * the tree freeing any blocks that have a ref count of zero after being 5063 *
4721 * decremented. 5064 * this function traverses the tree freeing any blocks that only
5065 * referenced by the tree.
5066 *
5067 * when a shared tree block is found. this function decreases its
5068 * reference count by one. if update_ref is true, this function
5069 * also make sure backrefs for the shared block and all lower level
5070 * blocks are properly updated.
4722 */ 5071 */
4723int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root 5072int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
4724 *root)
4725{ 5073{
4726 int ret = 0;
4727 int wret;
4728 int level;
4729 struct btrfs_path *path; 5074 struct btrfs_path *path;
4730 int update_count; 5075 struct btrfs_trans_handle *trans;
5076 struct btrfs_root *tree_root = root->fs_info->tree_root;
4731 struct btrfs_root_item *root_item = &root->root_item; 5077 struct btrfs_root_item *root_item = &root->root_item;
5078 struct walk_control *wc;
5079 struct btrfs_key key;
5080 int err = 0;
5081 int ret;
5082 int level;
4732 5083
4733 path = btrfs_alloc_path(); 5084 path = btrfs_alloc_path();
4734 BUG_ON(!path); 5085 BUG_ON(!path);
4735 5086
4736 level = btrfs_header_level(root->node); 5087 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5088 BUG_ON(!wc);
5089
5090 trans = btrfs_start_transaction(tree_root, 1);
5091
4737 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5092 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5093 level = btrfs_header_level(root->node);
4738 path->nodes[level] = btrfs_lock_root_node(root); 5094 path->nodes[level] = btrfs_lock_root_node(root);
4739 btrfs_set_lock_blocking(path->nodes[level]); 5095 btrfs_set_lock_blocking(path->nodes[level]);
4740 path->slots[level] = 0; 5096 path->slots[level] = 0;
4741 path->locks[level] = 1; 5097 path->locks[level] = 1;
5098 memset(&wc->update_progress, 0,
5099 sizeof(wc->update_progress));
4742 } else { 5100 } else {
4743 struct btrfs_key key;
4744 struct btrfs_disk_key found_key;
4745 struct extent_buffer *node;
4746
4747 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 5101 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5102 memcpy(&wc->update_progress, &key,
5103 sizeof(wc->update_progress));
5104
4748 level = root_item->drop_level; 5105 level = root_item->drop_level;
5106 BUG_ON(level == 0);
4749 path->lowest_level = level; 5107 path->lowest_level = level;
4750 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5108 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4751 if (wret < 0) { 5109 path->lowest_level = 0;
4752 ret = wret; 5110 if (ret < 0) {
5111 err = ret;
4753 goto out; 5112 goto out;
4754 } 5113 }
4755 node = path->nodes[level]; 5114 btrfs_node_key_to_cpu(path->nodes[level], &key,
4756 btrfs_node_key(node, &found_key, path->slots[level]); 5115 path->slots[level]);
4757 WARN_ON(memcmp(&found_key, &root_item->drop_progress, 5116 WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
4758 sizeof(found_key))); 5117
4759 /* 5118 /*
4760 * unlock our path, this is safe because only this 5119 * unlock our path, this is safe because only this
4761 * function is allowed to delete this snapshot 5120 * function is allowed to delete this snapshot
4762 */ 5121 */
4763 btrfs_unlock_up_safe(path, 0); 5122 btrfs_unlock_up_safe(path, 0);
5123
5124 level = btrfs_header_level(root->node);
5125 while (1) {
5126 btrfs_tree_lock(path->nodes[level]);
5127 btrfs_set_lock_blocking(path->nodes[level]);
5128
5129 ret = btrfs_lookup_extent_info(trans, root,
5130 path->nodes[level]->start,
5131 path->nodes[level]->len,
5132 &wc->refs[level],
5133 &wc->flags[level]);
5134 BUG_ON(ret);
5135 BUG_ON(wc->refs[level] == 0);
5136
5137 if (level == root_item->drop_level)
5138 break;
5139
5140 btrfs_tree_unlock(path->nodes[level]);
5141 WARN_ON(wc->refs[level] != 1);
5142 level--;
5143 }
4764 } 5144 }
5145
5146 wc->level = level;
5147 wc->shared_level = -1;
5148 wc->stage = DROP_REFERENCE;
5149 wc->update_ref = update_ref;
5150 wc->keep_locks = 0;
5151
4765 while (1) { 5152 while (1) {
4766 unsigned long update; 5153 ret = walk_down_tree(trans, root, path, wc);
4767 wret = walk_down_tree(trans, root, path, &level); 5154 if (ret < 0) {
4768 if (wret > 0) 5155 err = ret;
4769 break; 5156 break;
4770 if (wret < 0) 5157 }
4771 ret = wret;
4772 5158
4773 wret = walk_up_tree(trans, root, path, &level, 5159 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
4774 BTRFS_MAX_LEVEL); 5160 if (ret < 0) {
4775 if (wret > 0) 5161 err = ret;
4776 break; 5162 break;
4777 if (wret < 0) 5163 }
4778 ret = wret; 5164
4779 if (trans->transaction->in_commit || 5165 if (ret > 0) {
4780 trans->transaction->delayed_refs.flushing) { 5166 BUG_ON(wc->stage != DROP_REFERENCE);
4781 ret = -EAGAIN;
4782 break; 5167 break;
4783 } 5168 }
4784 for (update_count = 0; update_count < 16; update_count++) { 5169
5170 if (wc->stage == DROP_REFERENCE) {
5171 level = wc->level;
5172 btrfs_node_key(path->nodes[level],
5173 &root_item->drop_progress,
5174 path->slots[level]);
5175 root_item->drop_level = level;
5176 }
5177
5178 BUG_ON(wc->level == 0);
5179 if (trans->transaction->in_commit ||
5180 trans->transaction->delayed_refs.flushing) {
5181 ret = btrfs_update_root(trans, tree_root,
5182 &root->root_key,
5183 root_item);
5184 BUG_ON(ret);
5185
5186 btrfs_end_transaction(trans, tree_root);
5187 trans = btrfs_start_transaction(tree_root, 1);
5188 } else {
5189 unsigned long update;
4785 update = trans->delayed_ref_updates; 5190 update = trans->delayed_ref_updates;
4786 trans->delayed_ref_updates = 0; 5191 trans->delayed_ref_updates = 0;
4787 if (update) 5192 if (update)
4788 btrfs_run_delayed_refs(trans, root, update); 5193 btrfs_run_delayed_refs(trans, tree_root,
4789 else 5194 update);
4790 break;
4791 } 5195 }
4792 } 5196 }
5197 btrfs_release_path(root, path);
5198 BUG_ON(err);
5199
5200 ret = btrfs_del_root(trans, tree_root, &root->root_key);
5201 BUG_ON(ret);
5202
5203 free_extent_buffer(root->node);
5204 free_extent_buffer(root->commit_root);
5205 kfree(root);
4793out: 5206out:
5207 btrfs_end_transaction(trans, tree_root);
5208 kfree(wc);
4794 btrfs_free_path(path); 5209 btrfs_free_path(path);
4795 return ret; 5210 return err;
4796} 5211}
4797 5212
5213/*
5214 * drop subtree rooted at tree block 'node'.
5215 *
5216 * NOTE: this function will unlock and release tree block 'node'
5217 */
4798int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 5218int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
4799 struct btrfs_root *root, 5219 struct btrfs_root *root,
4800 struct extent_buffer *node, 5220 struct extent_buffer *node,
4801 struct extent_buffer *parent) 5221 struct extent_buffer *parent)
4802{ 5222{
4803 struct btrfs_path *path; 5223 struct btrfs_path *path;
5224 struct walk_control *wc;
4804 int level; 5225 int level;
4805 int parent_level; 5226 int parent_level;
4806 int ret = 0; 5227 int ret = 0;
4807 int wret; 5228 int wret;
4808 5229
5230 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5231
4809 path = btrfs_alloc_path(); 5232 path = btrfs_alloc_path();
4810 BUG_ON(!path); 5233 BUG_ON(!path);
4811 5234
5235 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5236 BUG_ON(!wc);
5237
4812 btrfs_assert_tree_locked(parent); 5238 btrfs_assert_tree_locked(parent);
4813 parent_level = btrfs_header_level(parent); 5239 parent_level = btrfs_header_level(parent);
4814 extent_buffer_get(parent); 5240 extent_buffer_get(parent);
@@ -4817,24 +5243,33 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
4817 5243
4818 btrfs_assert_tree_locked(node); 5244 btrfs_assert_tree_locked(node);
4819 level = btrfs_header_level(node); 5245 level = btrfs_header_level(node);
4820 extent_buffer_get(node);
4821 path->nodes[level] = node; 5246 path->nodes[level] = node;
4822 path->slots[level] = 0; 5247 path->slots[level] = 0;
5248 path->locks[level] = 1;
5249
5250 wc->refs[parent_level] = 1;
5251 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5252 wc->level = level;
5253 wc->shared_level = -1;
5254 wc->stage = DROP_REFERENCE;
5255 wc->update_ref = 0;
5256 wc->keep_locks = 1;
4823 5257
4824 while (1) { 5258 while (1) {
4825 wret = walk_down_tree(trans, root, path, &level); 5259 wret = walk_down_tree(trans, root, path, wc);
4826 if (wret < 0) 5260 if (wret < 0) {
4827 ret = wret; 5261 ret = wret;
4828 if (wret != 0)
4829 break; 5262 break;
5263 }
4830 5264
4831 wret = walk_up_tree(trans, root, path, &level, parent_level); 5265 wret = walk_up_tree(trans, root, path, wc, parent_level);
4832 if (wret < 0) 5266 if (wret < 0)
4833 ret = wret; 5267 ret = wret;
4834 if (wret != 0) 5268 if (wret != 0)
4835 break; 5269 break;
4836 } 5270 }
4837 5271
5272 kfree(wc);
4838 btrfs_free_path(path); 5273 btrfs_free_path(path);
4839 return ret; 5274 return ret;
4840} 5275}
@@ -6739,11 +7174,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6739 &info->block_group_cache_tree); 7174 &info->block_group_cache_tree);
6740 spin_unlock(&info->block_group_cache_lock); 7175 spin_unlock(&info->block_group_cache_lock);
6741 7176
6742 btrfs_remove_free_space_cache(block_group);
6743 down_write(&block_group->space_info->groups_sem); 7177 down_write(&block_group->space_info->groups_sem);
6744 list_del(&block_group->list); 7178 list_del(&block_group->list);
6745 up_write(&block_group->space_info->groups_sem); 7179 up_write(&block_group->space_info->groups_sem);
6746 7180
7181 if (block_group->cached == BTRFS_CACHE_STARTED)
7182 wait_event(block_group->caching_q,
7183 block_group_cache_done(block_group));
7184
7185 btrfs_remove_free_space_cache(block_group);
7186
6747 WARN_ON(atomic_read(&block_group->count) != 1); 7187 WARN_ON(atomic_read(&block_group->count) != 1);
6748 kfree(block_group); 7188 kfree(block_group);
6749 7189
@@ -6809,9 +7249,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
6809 atomic_set(&cache->count, 1); 7249 atomic_set(&cache->count, 1);
6810 spin_lock_init(&cache->lock); 7250 spin_lock_init(&cache->lock);
6811 spin_lock_init(&cache->tree_lock); 7251 spin_lock_init(&cache->tree_lock);
6812 mutex_init(&cache->cache_mutex); 7252 cache->fs_info = info;
7253 init_waitqueue_head(&cache->caching_q);
6813 INIT_LIST_HEAD(&cache->list); 7254 INIT_LIST_HEAD(&cache->list);
6814 INIT_LIST_HEAD(&cache->cluster_list); 7255 INIT_LIST_HEAD(&cache->cluster_list);
7256
7257 /*
7258 * we only want to have 32k of ram per block group for keeping
7259 * track of free space, and if we pass 1/2 of that we want to
7260 * start converting things over to using bitmaps
7261 */
7262 cache->extents_thresh = ((1024 * 32) / 2) /
7263 sizeof(struct btrfs_free_space);
7264
6815 read_extent_buffer(leaf, &cache->item, 7265 read_extent_buffer(leaf, &cache->item,
6816 btrfs_item_ptr_offset(leaf, path->slots[0]), 7266 btrfs_item_ptr_offset(leaf, path->slots[0]),
6817 sizeof(cache->item)); 7267 sizeof(cache->item));
@@ -6820,6 +7270,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)
6820 key.objectid = found_key.objectid + found_key.offset; 7270 key.objectid = found_key.objectid + found_key.offset;
6821 btrfs_release_path(root, path); 7271 btrfs_release_path(root, path);
6822 cache->flags = btrfs_block_group_flags(&cache->item); 7272 cache->flags = btrfs_block_group_flags(&cache->item);
7273 cache->sectorsize = root->sectorsize;
7274
7275 remove_sb_from_cache(root, cache);
7276
7277 /*
7278 * check for two cases, either we are full, and therefore
7279 * don't need to bother with the caching work since we won't
7280 * find any space, or we are empty, and we can just add all
7281 * the space in and be done with it. This saves us _a lot_ of
7282 * time, particularly in the full case.
7283 */
7284 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7285 cache->cached = BTRFS_CACHE_FINISHED;
7286 } else if (btrfs_block_group_used(&cache->item) == 0) {
7287 cache->cached = BTRFS_CACHE_FINISHED;
7288 add_new_free_space(cache, root->fs_info,
7289 found_key.objectid,
7290 found_key.objectid +
7291 found_key.offset);
7292 }
6823 7293
6824 ret = update_space_info(info, cache->flags, found_key.offset, 7294 ret = update_space_info(info, cache->flags, found_key.offset,
6825 btrfs_block_group_used(&cache->item), 7295 btrfs_block_group_used(&cache->item),
@@ -6863,10 +7333,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6863 cache->key.objectid = chunk_offset; 7333 cache->key.objectid = chunk_offset;
6864 cache->key.offset = size; 7334 cache->key.offset = size;
6865 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7335 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7336 cache->sectorsize = root->sectorsize;
7337
7338 /*
7339 * we only want to have 32k of ram per block group for keeping track
7340 * of free space, and if we pass 1/2 of that we want to start
7341 * converting things over to using bitmaps
7342 */
7343 cache->extents_thresh = ((1024 * 32) / 2) /
7344 sizeof(struct btrfs_free_space);
6866 atomic_set(&cache->count, 1); 7345 atomic_set(&cache->count, 1);
6867 spin_lock_init(&cache->lock); 7346 spin_lock_init(&cache->lock);
6868 spin_lock_init(&cache->tree_lock); 7347 spin_lock_init(&cache->tree_lock);
6869 mutex_init(&cache->cache_mutex); 7348 init_waitqueue_head(&cache->caching_q);
6870 INIT_LIST_HEAD(&cache->list); 7349 INIT_LIST_HEAD(&cache->list);
6871 INIT_LIST_HEAD(&cache->cluster_list); 7350 INIT_LIST_HEAD(&cache->cluster_list);
6872 7351
@@ -6875,6 +7354,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6875 cache->flags = type; 7354 cache->flags = type;
6876 btrfs_set_block_group_flags(&cache->item, type); 7355 btrfs_set_block_group_flags(&cache->item, type);
6877 7356
7357 cache->cached = BTRFS_CACHE_FINISHED;
7358 remove_sb_from_cache(root, cache);
7359
7360 add_new_free_space(cache, root->fs_info, chunk_offset,
7361 chunk_offset + size);
7362
6878 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7363 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
6879 &cache->space_info); 7364 &cache->space_info);
6880 BUG_ON(ret); 7365 BUG_ON(ret);
@@ -6933,7 +7418,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6933 rb_erase(&block_group->cache_node, 7418 rb_erase(&block_group->cache_node,
6934 &root->fs_info->block_group_cache_tree); 7419 &root->fs_info->block_group_cache_tree);
6935 spin_unlock(&root->fs_info->block_group_cache_lock); 7420 spin_unlock(&root->fs_info->block_group_cache_lock);
6936 btrfs_remove_free_space_cache(block_group); 7421
6937 down_write(&block_group->space_info->groups_sem); 7422 down_write(&block_group->space_info->groups_sem);
6938 /* 7423 /*
6939 * we must use list_del_init so people can check to see if they 7424 * we must use list_del_init so people can check to see if they
@@ -6942,11 +7427,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6942 list_del_init(&block_group->list); 7427 list_del_init(&block_group->list);
6943 up_write(&block_group->space_info->groups_sem); 7428 up_write(&block_group->space_info->groups_sem);
6944 7429
7430 if (block_group->cached == BTRFS_CACHE_STARTED)
7431 wait_event(block_group->caching_q,
7432 block_group_cache_done(block_group));
7433
7434 btrfs_remove_free_space_cache(block_group);
7435
6945 spin_lock(&block_group->space_info->lock); 7436 spin_lock(&block_group->space_info->lock);
6946 block_group->space_info->total_bytes -= block_group->key.offset; 7437 block_group->space_info->total_bytes -= block_group->key.offset;
6947 block_group->space_info->bytes_readonly -= block_group->key.offset; 7438 block_group->space_info->bytes_readonly -= block_group->key.offset;
6948 spin_unlock(&block_group->space_info->lock); 7439 spin_unlock(&block_group->space_info->lock);
6949 block_group->space_info->full = 0; 7440
7441 btrfs_clear_space_info_full(root->fs_info);
6950 7442
6951 btrfs_put_block_group(block_group); 7443 btrfs_put_block_group(block_group);
6952 btrfs_put_block_group(block_group); 7444 btrfs_put_block_group(block_group);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 126477eaecf5..4b833972273a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -22,7 +22,6 @@
22#include <linux/time.h> 22#include <linux/time.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/smp_lock.h>
26#include <linux/backing-dev.h> 25#include <linux/backing-dev.h>
27#include <linux/mpage.h> 26#include <linux/mpage.h>
28#include <linux/swap.h> 27#include <linux/swap.h>
@@ -151,7 +150,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
151 } 150 }
152 if (end_pos > isize) { 151 if (end_pos > isize) {
153 i_size_write(inode, end_pos); 152 i_size_write(inode, end_pos);
154 btrfs_update_inode(trans, root, inode); 153 /* we've only changed i_size in ram, and we haven't updated
154 * the disk i_size. There is no need to log the inode
155 * at this time.
156 */
155 } 157 }
156 err = btrfs_end_transaction(trans, root); 158 err = btrfs_end_transaction(trans, root);
157out_unlock: 159out_unlock:
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4538e48581a5..5edcee3a617f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -16,45 +16,46 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/pagemap.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/math64.h>
20#include "ctree.h" 22#include "ctree.h"
21#include "free-space-cache.h" 23#include "free-space-cache.h"
22#include "transaction.h" 24#include "transaction.h"
23 25
24struct btrfs_free_space { 26#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
25 struct rb_node bytes_index; 27#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
30 28
31static int tree_insert_offset(struct rb_root *root, u64 offset, 29static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
32 struct rb_node *node) 30 u64 offset)
33{ 31{
34 struct rb_node **p = &root->rb_node; 32 BUG_ON(offset < bitmap_start);
35 struct rb_node *parent = NULL; 33 offset -= bitmap_start;
36 struct btrfs_free_space *info; 34 return (unsigned long)(div64_u64(offset, sectorsize));
35}
37 36
38 while (*p) { 37static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
39 parent = *p; 38{
40 info = rb_entry(parent, struct btrfs_free_space, offset_index); 39 return (unsigned long)(div64_u64(bytes, sectorsize));
40}
41 41
42 if (offset < info->offset) 42static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
43 p = &(*p)->rb_left; 43 u64 offset)
44 else if (offset > info->offset) 44{
45 p = &(*p)->rb_right; 45 u64 bitmap_start;
46 else 46 u64 bytes_per_bitmap;
47 return -EEXIST;
48 }
49 47
50 rb_link_node(node, parent, p); 48 bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
51 rb_insert_color(node, root); 49 bitmap_start = offset - block_group->key.objectid;
50 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
51 bitmap_start *= bytes_per_bitmap;
52 bitmap_start += block_group->key.objectid;
52 53
53 return 0; 54 return bitmap_start;
54} 55}
55 56
56static int tree_insert_bytes(struct rb_root *root, u64 bytes, 57static int tree_insert_offset(struct rb_root *root, u64 offset,
57 struct rb_node *node) 58 struct rb_node *node, int bitmap)
58{ 59{
59 struct rb_node **p = &root->rb_node; 60 struct rb_node **p = &root->rb_node;
60 struct rb_node *parent = NULL; 61 struct rb_node *parent = NULL;
@@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
62 63
63 while (*p) { 64 while (*p) {
64 parent = *p; 65 parent = *p;
65 info = rb_entry(parent, struct btrfs_free_space, bytes_index); 66 info = rb_entry(parent, struct btrfs_free_space, offset_index);
66 67
67 if (bytes < info->bytes) 68 if (offset < info->offset) {
68 p = &(*p)->rb_left; 69 p = &(*p)->rb_left;
69 else 70 } else if (offset > info->offset) {
70 p = &(*p)->rb_right; 71 p = &(*p)->rb_right;
72 } else {
73 /*
74 * we could have a bitmap entry and an extent entry
75 * share the same offset. If this is the case, we want
76 * the extent entry to always be found first if we do a
77 * linear search through the tree, since we want to have
78 * the quickest allocation time, and allocating from an
79 * extent is faster than allocating from a bitmap. So
80 * if we're inserting a bitmap and we find an entry at
81 * this offset, we want to go right, or after this entry
82 * logically. If we are inserting an extent and we've
83 * found a bitmap, we want to go left, or before
84 * logically.
85 */
86 if (bitmap) {
87 WARN_ON(info->bitmap);
88 p = &(*p)->rb_right;
89 } else {
90 WARN_ON(!info->bitmap);
91 p = &(*p)->rb_left;
92 }
93 }
71 } 94 }
72 95
73 rb_link_node(node, parent, p); 96 rb_link_node(node, parent, p);
@@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
79/* 102/*
80 * searches the tree for the given offset. 103 * searches the tree for the given offset.
81 * 104 *
82 * fuzzy == 1: this is used for allocations where we are given a hint of where 105 * fuzzy - If this is set, then we are trying to make an allocation, and we just
83 * to look for free space. Because the hint may not be completely on an offset 106 * want a section that has at least bytes size and comes at or after the given
84 * mark, or the hint may no longer point to free space we need to fudge our 107 * offset.
85 * results a bit. So we look for free space starting at or after offset with at
86 * least bytes size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
92 * starts at the given offset which is at least bytes size, and if its not there
93 * return NULL.
94 */ 108 */
95static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 109static struct btrfs_free_space *
96 u64 offset, u64 bytes, 110tree_search_offset(struct btrfs_block_group_cache *block_group,
97 int fuzzy) 111 u64 offset, int bitmap_only, int fuzzy)
98{ 112{
99 struct rb_node *n = root->rb_node; 113 struct rb_node *n = block_group->free_space_offset.rb_node;
100 struct btrfs_free_space *entry, *ret = NULL; 114 struct btrfs_free_space *entry, *prev = NULL;
115
116 /* find entry that is closest to the 'offset' */
117 while (1) {
118 if (!n) {
119 entry = NULL;
120 break;
121 }
101 122
102 while (n) {
103 entry = rb_entry(n, struct btrfs_free_space, offset_index); 123 entry = rb_entry(n, struct btrfs_free_space, offset_index);
124 prev = entry;
104 125
105 if (offset < entry->offset) { 126 if (offset < entry->offset)
106 if (fuzzy &&
107 (!ret || entry->offset < ret->offset) &&
108 (bytes <= entry->bytes))
109 ret = entry;
110 n = n->rb_left; 127 n = n->rb_left;
111 } else if (offset > entry->offset) { 128 else if (offset > entry->offset)
112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
114 bytes <= entry->bytes) {
115 ret = entry;
116 break;
117 }
118 n = n->rb_right; 129 n = n->rb_right;
119 } else { 130 else
120 if (bytes > entry->bytes) {
121 n = n->rb_right;
122 continue;
123 }
124 ret = entry;
125 break; 131 break;
126 }
127 } 132 }
128 133
129 return ret; 134 if (bitmap_only) {
130} 135 if (!entry)
136 return NULL;
137 if (entry->bitmap)
138 return entry;
131 139
132/* 140 /*
133 * return a chunk at least bytes size, as close to offset that we can get. 141 * bitmap entry and extent entry may share same offset,
134 */ 142 * in that case, bitmap entry comes after extent entry.
135static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, 143 */
136 u64 offset, u64 bytes) 144 n = rb_next(n);
137{ 145 if (!n)
138 struct rb_node *n = root->rb_node; 146 return NULL;
139 struct btrfs_free_space *entry, *ret = NULL; 147 entry = rb_entry(n, struct btrfs_free_space, offset_index);
140 148 if (entry->offset != offset)
141 while (n) { 149 return NULL;
142 entry = rb_entry(n, struct btrfs_free_space, bytes_index);
143 150
144 if (bytes < entry->bytes) { 151 WARN_ON(!entry->bitmap);
152 return entry;
153 } else if (entry) {
154 if (entry->bitmap) {
145 /* 155 /*
146 * We prefer to get a hole size as close to the size we 156 * if previous extent entry covers the offset,
147 * are asking for so we don't take small slivers out of 157 * we should return it instead of the bitmap entry
148 * huge holes, but we also want to get as close to the
149 * offset as possible so we don't have a whole lot of
150 * fragmentation.
151 */ 158 */
152 if (offset <= entry->offset) { 159 n = &entry->offset_index;
153 if (!ret) 160 while (1) {
154 ret = entry; 161 n = rb_prev(n);
155 else if (entry->bytes < ret->bytes) 162 if (!n)
156 ret = entry; 163 break;
157 else if (entry->offset < ret->offset) 164 prev = rb_entry(n, struct btrfs_free_space,
158 ret = entry; 165 offset_index);
166 if (!prev->bitmap) {
167 if (prev->offset + prev->bytes > offset)
168 entry = prev;
169 break;
170 }
159 } 171 }
160 n = n->rb_left; 172 }
161 } else if (bytes > entry->bytes) { 173 return entry;
162 n = n->rb_right; 174 }
175
176 if (!prev)
177 return NULL;
178
179 /* find last entry before the 'offset' */
180 entry = prev;
181 if (entry->offset > offset) {
182 n = rb_prev(&entry->offset_index);
183 if (n) {
184 entry = rb_entry(n, struct btrfs_free_space,
185 offset_index);
186 BUG_ON(entry->offset > offset);
163 } else { 187 } else {
164 /* 188 if (fuzzy)
165 * Ok we may have multiple chunks of the wanted size, 189 return entry;
166 * so we don't want to take the first one we find, we 190 else
167 * want to take the one closest to our given offset, so 191 return NULL;
168 * keep searching just in case theres a better match.
169 */
170 n = n->rb_right;
171 if (offset > entry->offset)
172 continue;
173 else if (!ret || entry->offset < ret->offset)
174 ret = entry;
175 } 192 }
176 } 193 }
177 194
178 return ret; 195 if (entry->bitmap) {
196 n = &entry->offset_index;
197 while (1) {
198 n = rb_prev(n);
199 if (!n)
200 break;
201 prev = rb_entry(n, struct btrfs_free_space,
202 offset_index);
203 if (!prev->bitmap) {
204 if (prev->offset + prev->bytes > offset)
205 return prev;
206 break;
207 }
208 }
209 if (entry->offset + BITS_PER_BITMAP *
210 block_group->sectorsize > offset)
211 return entry;
212 } else if (entry->offset + entry->bytes > offset)
213 return entry;
214
215 if (!fuzzy)
216 return NULL;
217
218 while (1) {
219 if (entry->bitmap) {
220 if (entry->offset + BITS_PER_BITMAP *
221 block_group->sectorsize > offset)
222 break;
223 } else {
224 if (entry->offset + entry->bytes > offset)
225 break;
226 }
227
228 n = rb_next(&entry->offset_index);
229 if (!n)
230 return NULL;
231 entry = rb_entry(n, struct btrfs_free_space, offset_index);
232 }
233 return entry;
179} 234}
180 235
181static void unlink_free_space(struct btrfs_block_group_cache *block_group, 236static void unlink_free_space(struct btrfs_block_group_cache *block_group,
182 struct btrfs_free_space *info) 237 struct btrfs_free_space *info)
183{ 238{
184 rb_erase(&info->offset_index, &block_group->free_space_offset); 239 rb_erase(&info->offset_index, &block_group->free_space_offset);
185 rb_erase(&info->bytes_index, &block_group->free_space_bytes); 240 block_group->free_extents--;
241 block_group->free_space -= info->bytes;
186} 242}
187 243
188static int link_free_space(struct btrfs_block_group_cache *block_group, 244static int link_free_space(struct btrfs_block_group_cache *block_group,
@@ -190,17 +246,353 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
190{ 246{
191 int ret = 0; 247 int ret = 0;
192 248
193 249 BUG_ON(!info->bitmap && !info->bytes);
194 BUG_ON(!info->bytes);
195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 250 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
196 &info->offset_index); 251 &info->offset_index, (info->bitmap != NULL));
197 if (ret) 252 if (ret)
198 return ret; 253 return ret;
199 254
200 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, 255 block_group->free_space += info->bytes;
201 &info->bytes_index); 256 block_group->free_extents++;
202 if (ret) 257 return ret;
203 return ret; 258}
259
260static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
261{
262 u64 max_bytes, possible_bytes;
263
264 /*
265 * The goal is to keep the total amount of memory used per 1gb of space
266 * at or below 32k, so we need to adjust how much memory we allow to be
267 * used by extent based free space tracking
268 */
269 max_bytes = MAX_CACHE_BYTES_PER_GIG *
270 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
271
272 possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
273 (sizeof(struct btrfs_free_space) *
274 block_group->extents_thresh);
275
276 if (possible_bytes > max_bytes) {
277 int extent_bytes = max_bytes -
278 (block_group->total_bitmaps * PAGE_CACHE_SIZE);
279
280 if (extent_bytes <= 0) {
281 block_group->extents_thresh = 0;
282 return;
283 }
284
285 block_group->extents_thresh = extent_bytes /
286 (sizeof(struct btrfs_free_space));
287 }
288}
289
290static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
291 struct btrfs_free_space *info, u64 offset,
292 u64 bytes)
293{
294 unsigned long start, end;
295 unsigned long i;
296
297 start = offset_to_bit(info->offset, block_group->sectorsize, offset);
298 end = start + bytes_to_bits(bytes, block_group->sectorsize);
299 BUG_ON(end > BITS_PER_BITMAP);
300
301 for (i = start; i < end; i++)
302 clear_bit(i, info->bitmap);
303
304 info->bytes -= bytes;
305 block_group->free_space -= bytes;
306}
307
308static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
309 struct btrfs_free_space *info, u64 offset,
310 u64 bytes)
311{
312 unsigned long start, end;
313 unsigned long i;
314
315 start = offset_to_bit(info->offset, block_group->sectorsize, offset);
316 end = start + bytes_to_bits(bytes, block_group->sectorsize);
317 BUG_ON(end > BITS_PER_BITMAP);
318
319 for (i = start; i < end; i++)
320 set_bit(i, info->bitmap);
321
322 info->bytes += bytes;
323 block_group->free_space += bytes;
324}
325
326static int search_bitmap(struct btrfs_block_group_cache *block_group,
327 struct btrfs_free_space *bitmap_info, u64 *offset,
328 u64 *bytes)
329{
330 unsigned long found_bits = 0;
331 unsigned long bits, i;
332 unsigned long next_zero;
333
334 i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
335 max_t(u64, *offset, bitmap_info->offset));
336 bits = bytes_to_bits(*bytes, block_group->sectorsize);
337
338 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
339 i < BITS_PER_BITMAP;
340 i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
341 next_zero = find_next_zero_bit(bitmap_info->bitmap,
342 BITS_PER_BITMAP, i);
343 if ((next_zero - i) >= bits) {
344 found_bits = next_zero - i;
345 break;
346 }
347 i = next_zero;
348 }
349
350 if (found_bits) {
351 *offset = (u64)(i * block_group->sectorsize) +
352 bitmap_info->offset;
353 *bytes = (u64)(found_bits) * block_group->sectorsize;
354 return 0;
355 }
356
357 return -1;
358}
359
360static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
361 *block_group, u64 *offset,
362 u64 *bytes, int debug)
363{
364 struct btrfs_free_space *entry;
365 struct rb_node *node;
366 int ret;
367
368 if (!block_group->free_space_offset.rb_node)
369 return NULL;
370
371 entry = tree_search_offset(block_group,
372 offset_to_bitmap(block_group, *offset),
373 0, 1);
374 if (!entry)
375 return NULL;
376
377 for (node = &entry->offset_index; node; node = rb_next(node)) {
378 entry = rb_entry(node, struct btrfs_free_space, offset_index);
379 if (entry->bytes < *bytes)
380 continue;
381
382 if (entry->bitmap) {
383 ret = search_bitmap(block_group, entry, offset, bytes);
384 if (!ret)
385 return entry;
386 continue;
387 }
388
389 *offset = entry->offset;
390 *bytes = entry->bytes;
391 return entry;
392 }
393
394 return NULL;
395}
396
397static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
398 struct btrfs_free_space *info, u64 offset)
399{
400 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
401 int max_bitmaps = (int)div64_u64(block_group->key.offset +
402 bytes_per_bg - 1, bytes_per_bg);
403 BUG_ON(block_group->total_bitmaps >= max_bitmaps);
404
405 info->offset = offset_to_bitmap(block_group, offset);
406 link_free_space(block_group, info);
407 block_group->total_bitmaps++;
408
409 recalculate_thresholds(block_group);
410}
411
412static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
413 struct btrfs_free_space *bitmap_info,
414 u64 *offset, u64 *bytes)
415{
416 u64 end;
417 u64 search_start, search_bytes;
418 int ret;
419
420again:
421 end = bitmap_info->offset +
422 (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
423
424 /*
425 * XXX - this can go away after a few releases.
426 *
427 * since the only user of btrfs_remove_free_space is the tree logging
428 * stuff, and the only way to test that is under crash conditions, we
429 * want to have this debug stuff here just in case somethings not
430 * working. Search the bitmap for the space we are trying to use to
431 * make sure its actually there. If its not there then we need to stop
432 * because something has gone wrong.
433 */
434 search_start = *offset;
435 search_bytes = *bytes;
436 ret = search_bitmap(block_group, bitmap_info, &search_start,
437 &search_bytes);
438 BUG_ON(ret < 0 || search_start != *offset);
439
440 if (*offset > bitmap_info->offset && *offset + *bytes > end) {
441 bitmap_clear_bits(block_group, bitmap_info, *offset,
442 end - *offset + 1);
443 *bytes -= end - *offset + 1;
444 *offset = end + 1;
445 } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
446 bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
447 *bytes = 0;
448 }
449
450 if (*bytes) {
451 struct rb_node *next = rb_next(&bitmap_info->offset_index);
452 if (!bitmap_info->bytes) {
453 unlink_free_space(block_group, bitmap_info);
454 kfree(bitmap_info->bitmap);
455 kfree(bitmap_info);
456 block_group->total_bitmaps--;
457 recalculate_thresholds(block_group);
458 }
459
460 /*
461 * no entry after this bitmap, but we still have bytes to
462 * remove, so something has gone wrong.
463 */
464 if (!next)
465 return -EINVAL;
466
467 bitmap_info = rb_entry(next, struct btrfs_free_space,
468 offset_index);
469
470 /*
471 * if the next entry isn't a bitmap we need to return to let the
472 * extent stuff do its work.
473 */
474 if (!bitmap_info->bitmap)
475 return -EAGAIN;
476
477 /*
478 * Ok the next item is a bitmap, but it may not actually hold
479 * the information for the rest of this free space stuff, so
480 * look for it, and if we don't find it return so we can try
481 * everything over again.
482 */
483 search_start = *offset;
484 search_bytes = *bytes;
485 ret = search_bitmap(block_group, bitmap_info, &search_start,
486 &search_bytes);
487 if (ret < 0 || search_start != *offset)
488 return -EAGAIN;
489
490 goto again;
491 } else if (!bitmap_info->bytes) {
492 unlink_free_space(block_group, bitmap_info);
493 kfree(bitmap_info->bitmap);
494 kfree(bitmap_info);
495 block_group->total_bitmaps--;
496 recalculate_thresholds(block_group);
497 }
498
499 return 0;
500}
501
502static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
503 struct btrfs_free_space *info)
504{
505 struct btrfs_free_space *bitmap_info;
506 int added = 0;
507 u64 bytes, offset, end;
508 int ret;
509
510 /*
511 * If we are below the extents threshold then we can add this as an
512 * extent, and don't have to deal with the bitmap
513 */
514 if (block_group->free_extents < block_group->extents_thresh &&
515 info->bytes > block_group->sectorsize * 4)
516 return 0;
517
518 /*
519 * some block groups are so tiny they can't be enveloped by a bitmap, so
520 * don't even bother to create a bitmap for this
521 */
522 if (BITS_PER_BITMAP * block_group->sectorsize >
523 block_group->key.offset)
524 return 0;
525
526 bytes = info->bytes;
527 offset = info->offset;
528
529again:
530 bitmap_info = tree_search_offset(block_group,
531 offset_to_bitmap(block_group, offset),
532 1, 0);
533 if (!bitmap_info) {
534 BUG_ON(added);
535 goto new_bitmap;
536 }
537
538 end = bitmap_info->offset +
539 (u64)(BITS_PER_BITMAP * block_group->sectorsize);
540
541 if (offset >= bitmap_info->offset && offset + bytes > end) {
542 bitmap_set_bits(block_group, bitmap_info, offset,
543 end - offset);
544 bytes -= end - offset;
545 offset = end;
546 added = 0;
547 } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
548 bitmap_set_bits(block_group, bitmap_info, offset, bytes);
549 bytes = 0;
550 } else {
551 BUG();
552 }
553
554 if (!bytes) {
555 ret = 1;
556 goto out;
557 } else
558 goto again;
559
560new_bitmap:
561 if (info && info->bitmap) {
562 add_new_bitmap(block_group, info, offset);
563 added = 1;
564 info = NULL;
565 goto again;
566 } else {
567 spin_unlock(&block_group->tree_lock);
568
569 /* no pre-allocated info, allocate a new one */
570 if (!info) {
571 info = kzalloc(sizeof(struct btrfs_free_space),
572 GFP_NOFS);
573 if (!info) {
574 spin_lock(&block_group->tree_lock);
575 ret = -ENOMEM;
576 goto out;
577 }
578 }
579
580 /* allocate the bitmap */
581 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
582 spin_lock(&block_group->tree_lock);
583 if (!info->bitmap) {
584 ret = -ENOMEM;
585 goto out;
586 }
587 goto again;
588 }
589
590out:
591 if (info) {
592 if (info->bitmap)
593 kfree(info->bitmap);
594 kfree(info);
595 }
204 596
205 return ret; 597 return ret;
206} 598}
@@ -208,8 +600,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 600int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
209 u64 offset, u64 bytes) 601 u64 offset, u64 bytes)
210{ 602{
211 struct btrfs_free_space *right_info; 603 struct btrfs_free_space *right_info = NULL;
212 struct btrfs_free_space *left_info; 604 struct btrfs_free_space *left_info = NULL;
213 struct btrfs_free_space *info = NULL; 605 struct btrfs_free_space *info = NULL;
214 int ret = 0; 606 int ret = 0;
215 607
@@ -227,18 +619,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
227 * are adding, if there is remove that struct and add a new one to 619 * are adding, if there is remove that struct and add a new one to
228 * cover the entire range 620 * cover the entire range
229 */ 621 */
230 right_info = tree_search_offset(&block_group->free_space_offset, 622 right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
231 offset+bytes, 0, 0); 623 if (right_info && rb_prev(&right_info->offset_index))
232 left_info = tree_search_offset(&block_group->free_space_offset, 624 left_info = rb_entry(rb_prev(&right_info->offset_index),
233 offset-1, 0, 1); 625 struct btrfs_free_space, offset_index);
626 else
627 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
628
629 /*
630 * If there was no extent directly to the left or right of this new
631 * extent then we know we're going to have to allocate a new extent, so
632 * before we do that see if we need to drop this into a bitmap
633 */
634 if ((!left_info || left_info->bitmap) &&
635 (!right_info || right_info->bitmap)) {
636 ret = insert_into_bitmap(block_group, info);
637
638 if (ret < 0) {
639 goto out;
640 } else if (ret) {
641 ret = 0;
642 goto out;
643 }
644 }
234 645
235 if (right_info) { 646 if (right_info && !right_info->bitmap) {
236 unlink_free_space(block_group, right_info); 647 unlink_free_space(block_group, right_info);
237 info->bytes += right_info->bytes; 648 info->bytes += right_info->bytes;
238 kfree(right_info); 649 kfree(right_info);
239 } 650 }
240 651
241 if (left_info && left_info->offset + left_info->bytes == offset) { 652 if (left_info && !left_info->bitmap &&
653 left_info->offset + left_info->bytes == offset) {
242 unlink_free_space(block_group, left_info); 654 unlink_free_space(block_group, left_info);
243 info->offset = left_info->offset; 655 info->offset = left_info->offset;
244 info->bytes += left_info->bytes; 656 info->bytes += left_info->bytes;
@@ -248,11 +660,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
248 ret = link_free_space(block_group, info); 660 ret = link_free_space(block_group, info);
249 if (ret) 661 if (ret)
250 kfree(info); 662 kfree(info);
251 663out:
252 spin_unlock(&block_group->tree_lock); 664 spin_unlock(&block_group->tree_lock);
253 665
254 if (ret) { 666 if (ret) {
255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 667 printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
256 BUG_ON(ret == -EEXIST); 668 BUG_ON(ret == -EEXIST);
257 } 669 }
258 670
@@ -263,40 +675,74 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
263 u64 offset, u64 bytes) 675 u64 offset, u64 bytes)
264{ 676{
265 struct btrfs_free_space *info; 677 struct btrfs_free_space *info;
678 struct btrfs_free_space *next_info = NULL;
266 int ret = 0; 679 int ret = 0;
267 680
268 spin_lock(&block_group->tree_lock); 681 spin_lock(&block_group->tree_lock);
269 682
270 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 683again:
271 1); 684 info = tree_search_offset(block_group, offset, 0, 0);
272 if (info && info->offset == offset) { 685 if (!info) {
273 if (info->bytes < bytes) { 686 /*
274 printk(KERN_ERR "Found free space at %llu, size %llu," 687 * oops didn't find an extent that matched the space we wanted
275 "trying to use %llu\n", 688 * to remove, look for a bitmap instead
276 (unsigned long long)info->offset, 689 */
277 (unsigned long long)info->bytes, 690 info = tree_search_offset(block_group,
278 (unsigned long long)bytes); 691 offset_to_bitmap(block_group, offset),
692 1, 0);
693 if (!info) {
694 WARN_ON(1);
695 goto out_lock;
696 }
697 }
698
699 if (info->bytes < bytes && rb_next(&info->offset_index)) {
700 u64 end;
701 next_info = rb_entry(rb_next(&info->offset_index),
702 struct btrfs_free_space,
703 offset_index);
704
705 if (next_info->bitmap)
706 end = next_info->offset + BITS_PER_BITMAP *
707 block_group->sectorsize - 1;
708 else
709 end = next_info->offset + next_info->bytes;
710
711 if (next_info->bytes < bytes ||
712 next_info->offset > offset || offset > end) {
713 printk(KERN_CRIT "Found free space at %llu, size %llu,"
714 " trying to use %llu\n",
715 (unsigned long long)info->offset,
716 (unsigned long long)info->bytes,
717 (unsigned long long)bytes);
279 WARN_ON(1); 718 WARN_ON(1);
280 ret = -EINVAL; 719 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock); 720 goto out_lock;
282 goto out;
283 } 721 }
284 unlink_free_space(block_group, info);
285 722
286 if (info->bytes == bytes) { 723 info = next_info;
287 kfree(info); 724 }
288 spin_unlock(&block_group->tree_lock); 725
289 goto out; 726 if (info->bytes == bytes) {
727 unlink_free_space(block_group, info);
728 if (info->bitmap) {
729 kfree(info->bitmap);
730 block_group->total_bitmaps--;
290 } 731 }
732 kfree(info);
733 goto out_lock;
734 }
291 735
736 if (!info->bitmap && info->offset == offset) {
737 unlink_free_space(block_group, info);
292 info->offset += bytes; 738 info->offset += bytes;
293 info->bytes -= bytes; 739 info->bytes -= bytes;
740 link_free_space(block_group, info);
741 goto out_lock;
742 }
294 743
295 ret = link_free_space(block_group, info); 744 if (!info->bitmap && info->offset <= offset &&
296 spin_unlock(&block_group->tree_lock); 745 info->offset + info->bytes >= offset + bytes) {
297 BUG_ON(ret);
298 } else if (info && info->offset < offset &&
299 info->offset + info->bytes >= offset + bytes) {
300 u64 old_start = info->offset; 746 u64 old_start = info->offset;
301 /* 747 /*
302 * we're freeing space in the middle of the info, 748 * we're freeing space in the middle of the info,
@@ -312,7 +758,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
312 info->offset = offset + bytes; 758 info->offset = offset + bytes;
313 info->bytes = old_end - info->offset; 759 info->bytes = old_end - info->offset;
314 ret = link_free_space(block_group, info); 760 ret = link_free_space(block_group, info);
315 BUG_ON(ret); 761 WARN_ON(ret);
762 if (ret)
763 goto out_lock;
316 } else { 764 } else {
317 /* the hole we're creating ends at the end 765 /* the hole we're creating ends at the end
318 * of the info struct, just free the info 766 * of the info struct, just free the info
@@ -320,32 +768,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
320 kfree(info); 768 kfree(info);
321 } 769 }
322 spin_unlock(&block_group->tree_lock); 770 spin_unlock(&block_group->tree_lock);
323 /* step two, insert a new info struct to cover anything 771
324 * before the hole 772 /* step two, insert a new info struct to cover
773 * anything before the hole
325 */ 774 */
326 ret = btrfs_add_free_space(block_group, old_start, 775 ret = btrfs_add_free_space(block_group, old_start,
327 offset - old_start); 776 offset - old_start);
328 BUG_ON(ret); 777 WARN_ON(ret);
329 } else { 778 goto out;
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached,
336 (unsigned long long)block_group->key.objectid,
337 (unsigned long long)block_group->key.offset);
338 btrfs_dump_free_space(block_group, bytes);
339 } else if (info) {
340 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
341 "but wanted offset=%llu bytes=%llu\n",
342 (unsigned long long)info->offset,
343 (unsigned long long)info->bytes,
344 (unsigned long long)offset,
345 (unsigned long long)bytes);
346 }
347 WARN_ON(1);
348 } 779 }
780
781 ret = remove_from_bitmap(block_group, info, &offset, &bytes);
782 if (ret == -EAGAIN)
783 goto again;
784 BUG_ON(ret);
785out_lock:
786 spin_unlock(&block_group->tree_lock);
349out: 787out:
350 return ret; 788 return ret;
351} 789}
@@ -361,10 +799,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
361 info = rb_entry(n, struct btrfs_free_space, offset_index); 799 info = rb_entry(n, struct btrfs_free_space, offset_index);
362 if (info->bytes >= bytes) 800 if (info->bytes >= bytes)
363 count++; 801 count++;
364 printk(KERN_ERR "entry offset %llu, bytes %llu\n", 802 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
365 (unsigned long long)info->offset, 803 (unsigned long long)info->offset,
366 (unsigned long long)info->bytes); 804 (unsigned long long)info->bytes,
805 (info->bitmap) ? "yes" : "no");
367 } 806 }
807 printk(KERN_INFO "block group has cluster?: %s\n",
808 list_empty(&block_group->cluster_list) ? "no" : "yes");
368 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 809 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
369 "\n", count); 810 "\n", count);
370} 811}
@@ -397,26 +838,35 @@ __btrfs_return_cluster_to_free_space(
397{ 838{
398 struct btrfs_free_space *entry; 839 struct btrfs_free_space *entry;
399 struct rb_node *node; 840 struct rb_node *node;
841 bool bitmap;
400 842
401 spin_lock(&cluster->lock); 843 spin_lock(&cluster->lock);
402 if (cluster->block_group != block_group) 844 if (cluster->block_group != block_group)
403 goto out; 845 goto out;
404 846
847 bitmap = cluster->points_to_bitmap;
848 cluster->block_group = NULL;
405 cluster->window_start = 0; 849 cluster->window_start = 0;
850 list_del_init(&cluster->block_group_list);
851 cluster->points_to_bitmap = false;
852
853 if (bitmap)
854 goto out;
855
406 node = rb_first(&cluster->root); 856 node = rb_first(&cluster->root);
407 while(node) { 857 while (node) {
408 entry = rb_entry(node, struct btrfs_free_space, offset_index); 858 entry = rb_entry(node, struct btrfs_free_space, offset_index);
409 node = rb_next(&entry->offset_index); 859 node = rb_next(&entry->offset_index);
410 rb_erase(&entry->offset_index, &cluster->root); 860 rb_erase(&entry->offset_index, &cluster->root);
411 link_free_space(block_group, entry); 861 BUG_ON(entry->bitmap);
862 tree_insert_offset(&block_group->free_space_offset,
863 entry->offset, &entry->offset_index, 0);
412 } 864 }
413 list_del_init(&cluster->block_group_list);
414
415 btrfs_put_block_group(cluster->block_group);
416 cluster->block_group = NULL;
417 cluster->root.rb_node = NULL; 865 cluster->root.rb_node = NULL;
866
418out: 867out:
419 spin_unlock(&cluster->lock); 868 spin_unlock(&cluster->lock);
869 btrfs_put_block_group(block_group);
420 return 0; 870 return 0;
421} 871}
422 872
@@ -425,20 +875,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
425 struct btrfs_free_space *info; 875 struct btrfs_free_space *info;
426 struct rb_node *node; 876 struct rb_node *node;
427 struct btrfs_free_cluster *cluster; 877 struct btrfs_free_cluster *cluster;
428 struct btrfs_free_cluster *safe; 878 struct list_head *head;
429 879
430 spin_lock(&block_group->tree_lock); 880 spin_lock(&block_group->tree_lock);
431 881 while ((head = block_group->cluster_list.next) !=
432 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, 882 &block_group->cluster_list) {
433 block_group_list) { 883 cluster = list_entry(head, struct btrfs_free_cluster,
884 block_group_list);
434 885
435 WARN_ON(cluster->block_group != block_group); 886 WARN_ON(cluster->block_group != block_group);
436 __btrfs_return_cluster_to_free_space(block_group, cluster); 887 __btrfs_return_cluster_to_free_space(block_group, cluster);
888 if (need_resched()) {
889 spin_unlock(&block_group->tree_lock);
890 cond_resched();
891 spin_lock(&block_group->tree_lock);
892 }
437 } 893 }
438 894
439 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 895 while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
440 info = rb_entry(node, struct btrfs_free_space, bytes_index); 896 info = rb_entry(node, struct btrfs_free_space, offset_index);
441 unlink_free_space(block_group, info); 897 unlink_free_space(block_group, info);
898 if (info->bitmap)
899 kfree(info->bitmap);
442 kfree(info); 900 kfree(info);
443 if (need_resched()) { 901 if (need_resched()) {
444 spin_unlock(&block_group->tree_lock); 902 spin_unlock(&block_group->tree_lock);
@@ -446,6 +904,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
446 spin_lock(&block_group->tree_lock); 904 spin_lock(&block_group->tree_lock);
447 } 905 }
448 } 906 }
907
449 spin_unlock(&block_group->tree_lock); 908 spin_unlock(&block_group->tree_lock);
450} 909}
451 910
@@ -453,25 +912,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
453 u64 offset, u64 bytes, u64 empty_size) 912 u64 offset, u64 bytes, u64 empty_size)
454{ 913{
455 struct btrfs_free_space *entry = NULL; 914 struct btrfs_free_space *entry = NULL;
915 u64 bytes_search = bytes + empty_size;
456 u64 ret = 0; 916 u64 ret = 0;
457 917
458 spin_lock(&block_group->tree_lock); 918 spin_lock(&block_group->tree_lock);
459 entry = tree_search_offset(&block_group->free_space_offset, offset, 919 entry = find_free_space(block_group, &offset, &bytes_search, 0);
460 bytes + empty_size, 1);
461 if (!entry) 920 if (!entry)
462 entry = tree_search_bytes(&block_group->free_space_bytes, 921 goto out;
463 offset, bytes + empty_size); 922
464 if (entry) { 923 ret = offset;
924 if (entry->bitmap) {
925 bitmap_clear_bits(block_group, entry, offset, bytes);
926 if (!entry->bytes) {
927 unlink_free_space(block_group, entry);
928 kfree(entry->bitmap);
929 kfree(entry);
930 block_group->total_bitmaps--;
931 recalculate_thresholds(block_group);
932 }
933 } else {
465 unlink_free_space(block_group, entry); 934 unlink_free_space(block_group, entry);
466 ret = entry->offset;
467 entry->offset += bytes; 935 entry->offset += bytes;
468 entry->bytes -= bytes; 936 entry->bytes -= bytes;
469
470 if (!entry->bytes) 937 if (!entry->bytes)
471 kfree(entry); 938 kfree(entry);
472 else 939 else
473 link_free_space(block_group, entry); 940 link_free_space(block_group, entry);
474 } 941 }
942
943out:
475 spin_unlock(&block_group->tree_lock); 944 spin_unlock(&block_group->tree_lock);
476 945
477 return ret; 946 return ret;
@@ -517,6 +986,54 @@ int btrfs_return_cluster_to_free_space(
517 return ret; 986 return ret;
518} 987}
519 988
989static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
990 struct btrfs_free_cluster *cluster,
991 u64 bytes, u64 min_start)
992{
993 struct btrfs_free_space *entry;
994 int err;
995 u64 search_start = cluster->window_start;
996 u64 search_bytes = bytes;
997 u64 ret = 0;
998
999 spin_lock(&block_group->tree_lock);
1000 spin_lock(&cluster->lock);
1001
1002 if (!cluster->points_to_bitmap)
1003 goto out;
1004
1005 if (cluster->block_group != block_group)
1006 goto out;
1007
1008 /*
1009 * search_start is the beginning of the bitmap, but at some point it may
1010 * be a good idea to point to the actual start of the free area in the
1011 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
1012 * to 1 to make sure we get the bitmap entry
1013 */
1014 entry = tree_search_offset(block_group,
1015 offset_to_bitmap(block_group, search_start),
1016 1, 0);
1017 if (!entry || !entry->bitmap)
1018 goto out;
1019
1020 search_start = min_start;
1021 search_bytes = bytes;
1022
1023 err = search_bitmap(block_group, entry, &search_start,
1024 &search_bytes);
1025 if (err)
1026 goto out;
1027
1028 ret = search_start;
1029 bitmap_clear_bits(block_group, entry, ret, bytes);
1030out:
1031 spin_unlock(&cluster->lock);
1032 spin_unlock(&block_group->tree_lock);
1033
1034 return ret;
1035}
1036
520/* 1037/*
521 * given a cluster, try to allocate 'bytes' from it, returns 0 1038 * given a cluster, try to allocate 'bytes' from it, returns 0
522 * if it couldn't find anything suitably large, or a logical disk offset 1039 * if it couldn't find anything suitably large, or a logical disk offset
@@ -530,6 +1047,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
530 struct rb_node *node; 1047 struct rb_node *node;
531 u64 ret = 0; 1048 u64 ret = 0;
532 1049
1050 if (cluster->points_to_bitmap)
1051 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
1052 min_start);
1053
533 spin_lock(&cluster->lock); 1054 spin_lock(&cluster->lock);
534 if (bytes > cluster->max_size) 1055 if (bytes > cluster->max_size)
535 goto out; 1056 goto out;
@@ -567,9 +1088,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
567 } 1088 }
568out: 1089out:
569 spin_unlock(&cluster->lock); 1090 spin_unlock(&cluster->lock);
1091
570 return ret; 1092 return ret;
571} 1093}
572 1094
1095static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1096 struct btrfs_free_space *entry,
1097 struct btrfs_free_cluster *cluster,
1098 u64 offset, u64 bytes, u64 min_bytes)
1099{
1100 unsigned long next_zero;
1101 unsigned long i;
1102 unsigned long search_bits;
1103 unsigned long total_bits;
1104 unsigned long found_bits;
1105 unsigned long start = 0;
1106 unsigned long total_found = 0;
1107 bool found = false;
1108
1109 i = offset_to_bit(entry->offset, block_group->sectorsize,
1110 max_t(u64, offset, entry->offset));
1111 search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1112 total_bits = bytes_to_bits(bytes, block_group->sectorsize);
1113
1114again:
1115 found_bits = 0;
1116 for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
1117 i < BITS_PER_BITMAP;
1118 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
1119 next_zero = find_next_zero_bit(entry->bitmap,
1120 BITS_PER_BITMAP, i);
1121 if (next_zero - i >= search_bits) {
1122 found_bits = next_zero - i;
1123 break;
1124 }
1125 i = next_zero;
1126 }
1127
1128 if (!found_bits)
1129 return -1;
1130
1131 if (!found) {
1132 start = i;
1133 found = true;
1134 }
1135
1136 total_found += found_bits;
1137
1138 if (cluster->max_size < found_bits * block_group->sectorsize)
1139 cluster->max_size = found_bits * block_group->sectorsize;
1140
1141 if (total_found < total_bits) {
1142 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
1143 if (i - start > total_bits * 2) {
1144 total_found = 0;
1145 cluster->max_size = 0;
1146 found = false;
1147 }
1148 goto again;
1149 }
1150
1151 cluster->window_start = start * block_group->sectorsize +
1152 entry->offset;
1153 cluster->points_to_bitmap = true;
1154
1155 return 0;
1156}
1157
573/* 1158/*
574 * here we try to find a cluster of blocks in a block group. The goal 1159 * here we try to find a cluster of blocks in a block group. The goal
575 * is to find at least bytes free and up to empty_size + bytes free. 1160 * is to find at least bytes free and up to empty_size + bytes free.
@@ -587,12 +1172,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
587 struct btrfs_free_space *entry = NULL; 1172 struct btrfs_free_space *entry = NULL;
588 struct rb_node *node; 1173 struct rb_node *node;
589 struct btrfs_free_space *next; 1174 struct btrfs_free_space *next;
590 struct btrfs_free_space *last; 1175 struct btrfs_free_space *last = NULL;
591 u64 min_bytes; 1176 u64 min_bytes;
592 u64 window_start; 1177 u64 window_start;
593 u64 window_free; 1178 u64 window_free;
594 u64 max_extent = 0; 1179 u64 max_extent = 0;
595 int total_retries = 0; 1180 bool found_bitmap = false;
596 int ret; 1181 int ret;
597 1182
598 /* for metadata, allow allocates with more holes */ 1183 /* for metadata, allow allocates with more holes */
@@ -620,31 +1205,80 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
620 goto out; 1205 goto out;
621 } 1206 }
622again: 1207again:
623 min_bytes = min(min_bytes, bytes + empty_size); 1208 entry = tree_search_offset(block_group, offset, found_bitmap, 1);
624 entry = tree_search_bytes(&block_group->free_space_bytes,
625 offset, min_bytes);
626 if (!entry) { 1209 if (!entry) {
627 ret = -ENOSPC; 1210 ret = -ENOSPC;
628 goto out; 1211 goto out;
629 } 1212 }
1213
1214 /*
1215 * If found_bitmap is true, we exhausted our search for extent entries,
1216 * and we just want to search all of the bitmaps that we can find, and
1217 * ignore any extent entries we find.
1218 */
1219 while (entry->bitmap || found_bitmap ||
1220 (!entry->bitmap && entry->bytes < min_bytes)) {
1221 struct rb_node *node = rb_next(&entry->offset_index);
1222
1223 if (entry->bitmap && entry->bytes > bytes + empty_size) {
1224 ret = btrfs_bitmap_cluster(block_group, entry, cluster,
1225 offset, bytes + empty_size,
1226 min_bytes);
1227 if (!ret)
1228 goto got_it;
1229 }
1230
1231 if (!node) {
1232 ret = -ENOSPC;
1233 goto out;
1234 }
1235 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1236 }
1237
1238 /*
1239 * We already searched all the extent entries from the passed in offset
1240 * to the end and didn't find enough space for the cluster, and we also
1241 * didn't find any bitmaps that met our criteria, just go ahead and exit
1242 */
1243 if (found_bitmap) {
1244 ret = -ENOSPC;
1245 goto out;
1246 }
1247
1248 cluster->points_to_bitmap = false;
630 window_start = entry->offset; 1249 window_start = entry->offset;
631 window_free = entry->bytes; 1250 window_free = entry->bytes;
632 last = entry; 1251 last = entry;
633 max_extent = entry->bytes; 1252 max_extent = entry->bytes;
634 1253
635 while(1) { 1254 while (1) {
636 /* out window is just right, lets fill it */ 1255 /* out window is just right, lets fill it */
637 if (window_free >= bytes + empty_size) 1256 if (window_free >= bytes + empty_size)
638 break; 1257 break;
639 1258
640 node = rb_next(&last->offset_index); 1259 node = rb_next(&last->offset_index);
641 if (!node) { 1260 if (!node) {
1261 if (found_bitmap)
1262 goto again;
642 ret = -ENOSPC; 1263 ret = -ENOSPC;
643 goto out; 1264 goto out;
644 } 1265 }
645 next = rb_entry(node, struct btrfs_free_space, offset_index); 1266 next = rb_entry(node, struct btrfs_free_space, offset_index);
646 1267
647 /* 1268 /*
1269 * we found a bitmap, so if this search doesn't result in a
1270 * cluster, we know to go and search again for the bitmaps and
1271 * start looking for space there
1272 */
1273 if (next->bitmap) {
1274 if (!found_bitmap)
1275 offset = next->offset;
1276 found_bitmap = true;
1277 last = next;
1278 continue;
1279 }
1280
1281 /*
648 * we haven't filled the empty size and the window is 1282 * we haven't filled the empty size and the window is
649 * very large. reset and try again 1283 * very large. reset and try again
650 */ 1284 */
@@ -655,19 +1289,6 @@ again:
655 window_free = entry->bytes; 1289 window_free = entry->bytes;
656 last = entry; 1290 last = entry;
657 max_extent = 0; 1291 max_extent = 0;
658 total_retries++;
659 if (total_retries % 64 == 0) {
660 if (min_bytes >= (bytes + empty_size)) {
661 ret = -ENOSPC;
662 goto out;
663 }
664 /*
665 * grow our allocation a bit, we're not having
666 * much luck
667 */
668 min_bytes *= 2;
669 goto again;
670 }
671 } else { 1292 } else {
672 last = next; 1293 last = next;
673 window_free += next->bytes; 1294 window_free += next->bytes;
@@ -685,11 +1306,19 @@ again:
685 * The cluster includes an rbtree, but only uses the offset index 1306 * The cluster includes an rbtree, but only uses the offset index
686 * of each free space cache entry. 1307 * of each free space cache entry.
687 */ 1308 */
688 while(1) { 1309 while (1) {
689 node = rb_next(&entry->offset_index); 1310 node = rb_next(&entry->offset_index);
690 unlink_free_space(block_group, entry); 1311 if (entry->bitmap && node) {
1312 entry = rb_entry(node, struct btrfs_free_space,
1313 offset_index);
1314 continue;
1315 } else if (entry->bitmap && !node) {
1316 break;
1317 }
1318
1319 rb_erase(&entry->offset_index, &block_group->free_space_offset);
691 ret = tree_insert_offset(&cluster->root, entry->offset, 1320 ret = tree_insert_offset(&cluster->root, entry->offset,
692 &entry->offset_index); 1321 &entry->offset_index, 0);
693 BUG_ON(ret); 1322 BUG_ON(ret);
694 1323
695 if (!node || entry == last) 1324 if (!node || entry == last)
@@ -697,8 +1326,10 @@ again:
697 1326
698 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1327 entry = rb_entry(node, struct btrfs_free_space, offset_index);
699 } 1328 }
700 ret = 0; 1329
701 cluster->max_size = max_extent; 1330 cluster->max_size = max_extent;
1331got_it:
1332 ret = 0;
702 atomic_inc(&block_group->count); 1333 atomic_inc(&block_group->count);
703 list_add_tail(&cluster->block_group_list, &block_group->cluster_list); 1334 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
704 cluster->block_group = block_group; 1335 cluster->block_group = block_group;
@@ -718,6 +1349,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
718 spin_lock_init(&cluster->refill_lock); 1349 spin_lock_init(&cluster->refill_lock);
719 cluster->root.rb_node = NULL; 1350 cluster->root.rb_node = NULL;
720 cluster->max_size = 0; 1351 cluster->max_size = 0;
1352 cluster->points_to_bitmap = false;
721 INIT_LIST_HEAD(&cluster->block_group_list); 1353 INIT_LIST_HEAD(&cluster->block_group_list);
722 cluster->block_group = NULL; 1354 cluster->block_group = NULL;
723} 1355}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 266fb8764054..890a8e79011b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -19,6 +19,14 @@
19#ifndef __BTRFS_FREE_SPACE_CACHE 19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE 20#define __BTRFS_FREE_SPACE_CACHE
21 21
22struct btrfs_free_space {
23 struct rb_node offset_index;
24 u64 offset;
25 u64 bytes;
26 unsigned long *bitmap;
27 struct list_head list;
28};
29
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 30int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size); 31 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 32int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dbe1aabf96cd..272b9b2bea86 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -26,7 +26,6 @@
26#include <linux/time.h> 26#include <linux/time.h>
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
31#include <linux/mpage.h> 30#include <linux/mpage.h>
32#include <linux/swap.h> 31#include <linux/swap.h>
@@ -2604,8 +2603,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2604 if (root->ref_cows) 2603 if (root->ref_cows)
2605 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 2604 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2606 path = btrfs_alloc_path(); 2605 path = btrfs_alloc_path();
2607 path->reada = -1;
2608 BUG_ON(!path); 2606 BUG_ON(!path);
2607 path->reada = -1;
2609 2608
2610 /* FIXME, add redo link to tree so we don't leak on crash */ 2609 /* FIXME, add redo link to tree so we don't leak on crash */
2611 key.objectid = inode->i_ino; 2610 key.objectid = inode->i_ino;
@@ -3580,12 +3579,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3580 owner = 1; 3579 owner = 1;
3581 BTRFS_I(inode)->block_group = 3580 BTRFS_I(inode)->block_group =
3582 btrfs_find_block_group(root, 0, alloc_hint, owner); 3581 btrfs_find_block_group(root, 0, alloc_hint, owner);
3583 if ((mode & S_IFREG)) {
3584 if (btrfs_test_opt(root, NODATASUM))
3585 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
3586 if (btrfs_test_opt(root, NODATACOW))
3587 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
3588 }
3589 3582
3590 key[0].objectid = objectid; 3583 key[0].objectid = objectid;
3591 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 3584 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -3640,6 +3633,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3640 3633
3641 btrfs_inherit_iflags(inode, dir); 3634 btrfs_inherit_iflags(inode, dir);
3642 3635
3636 if ((mode & S_IFREG)) {
3637 if (btrfs_test_opt(root, NODATASUM))
3638 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
3639 if (btrfs_test_opt(root, NODATACOW))
3640 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
3641 }
3642
3643 insert_inode_hash(inode); 3643 insert_inode_hash(inode);
3644 inode_tree_add(inode); 3644 inode_tree_add(inode);
3645 return inode; 3645 return inode;
@@ -4785,8 +4785,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4785 * and the replacement file is large. Start IO on it now so 4785 * and the replacement file is large. Start IO on it now so
4786 * we don't add too much work to the end of the transaction 4786 * we don't add too much work to the end of the transaction
4787 */ 4787 */
4788 if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && 4788 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
4789 new_inode->i_size &&
4790 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) 4789 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
4791 filemap_flush(old_inode->i_mapping); 4790 filemap_flush(old_inode->i_mapping);
4792 4791
@@ -5082,6 +5081,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5082 u64 mask = BTRFS_I(inode)->root->sectorsize - 1; 5081 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
5083 struct extent_map *em; 5082 struct extent_map *em;
5084 struct btrfs_trans_handle *trans; 5083 struct btrfs_trans_handle *trans;
5084 struct btrfs_root *root;
5085 int ret; 5085 int ret;
5086 5086
5087 alloc_start = offset & ~mask; 5087 alloc_start = offset & ~mask;
@@ -5100,6 +5100,13 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5100 goto out; 5100 goto out;
5101 } 5101 }
5102 5102
5103 root = BTRFS_I(inode)->root;
5104
5105 ret = btrfs_check_data_free_space(root, inode,
5106 alloc_end - alloc_start);
5107 if (ret)
5108 goto out;
5109
5103 locked_end = alloc_end - 1; 5110 locked_end = alloc_end - 1;
5104 while (1) { 5111 while (1) {
5105 struct btrfs_ordered_extent *ordered; 5112 struct btrfs_ordered_extent *ordered;
@@ -5107,7 +5114,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5107 trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); 5114 trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
5108 if (!trans) { 5115 if (!trans) {
5109 ret = -EIO; 5116 ret = -EIO;
5110 goto out; 5117 goto out_free;
5111 } 5118 }
5112 5119
5113 /* the extent lock is ordered inside the running 5120 /* the extent lock is ordered inside the running
@@ -5168,6 +5175,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5168 GFP_NOFS); 5175 GFP_NOFS);
5169 5176
5170 btrfs_end_transaction(trans, BTRFS_I(inode)->root); 5177 btrfs_end_transaction(trans, BTRFS_I(inode)->root);
5178out_free:
5179 btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start);
5171out: 5180out:
5172 mutex_unlock(&inode->i_mutex); 5181 mutex_unlock(&inode->i_mutex);
5173 return ret; 5182 return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index eff18f5b5362..bd88f25889f7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -27,7 +27,6 @@
27#include <linux/time.h> 27#include <linux/time.h>
28#include <linux/init.h> 28#include <linux/init.h>
29#include <linux/string.h> 29#include <linux/string.h>
30#include <linux/smp_lock.h>
31#include <linux/backing-dev.h> 30#include <linux/backing-dev.h>
32#include <linux/mount.h> 31#include <linux/mount.h>
33#include <linux/mpage.h> 32#include <linux/mpage.h>
@@ -1028,7 +1027,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1028 struct btrfs_file_extent_item); 1027 struct btrfs_file_extent_item);
1029 comp = btrfs_file_extent_compression(leaf, extent); 1028 comp = btrfs_file_extent_compression(leaf, extent);
1030 type = btrfs_file_extent_type(leaf, extent); 1029 type = btrfs_file_extent_type(leaf, extent);
1031 if (type == BTRFS_FILE_EXTENT_REG) { 1030 if (type == BTRFS_FILE_EXTENT_REG ||
1031 type == BTRFS_FILE_EXTENT_PREALLOC) {
1032 disko = btrfs_file_extent_disk_bytenr(leaf, 1032 disko = btrfs_file_extent_disk_bytenr(leaf,
1033 extent); 1033 extent);
1034 diskl = btrfs_file_extent_disk_num_bytes(leaf, 1034 diskl = btrfs_file_extent_disk_num_bytes(leaf,
@@ -1051,7 +1051,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1051 new_key.objectid = inode->i_ino; 1051 new_key.objectid = inode->i_ino;
1052 new_key.offset = key.offset + destoff - off; 1052 new_key.offset = key.offset + destoff - off;
1053 1053
1054 if (type == BTRFS_FILE_EXTENT_REG) { 1054 if (type == BTRFS_FILE_EXTENT_REG ||
1055 type == BTRFS_FILE_EXTENT_PREALLOC) {
1055 ret = btrfs_insert_empty_item(trans, root, path, 1056 ret = btrfs_insert_empty_item(trans, root, path,
1056 &new_key, size); 1057 &new_key, size);
1057 if (ret) 1058 if (ret)
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 6d6523da0a30..0d126be22b63 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -309,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
309 } 309 }
310 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", 310 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
311 (unsigned long long)btrfs_header_bytenr(c), 311 (unsigned long long)btrfs_header_bytenr(c),
312 btrfs_header_level(c), nr, 312 level, nr,
313 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); 313 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
314 for (i = 0; i < nr; i++) { 314 for (i = 0; i < nr; i++) {
315 btrfs_node_key_to_cpu(c, &key, i); 315 btrfs_node_key_to_cpu(c, &key, i);
@@ -326,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
326 btrfs_level_size(root, level - 1), 326 btrfs_level_size(root, level - 1),
327 btrfs_node_ptr_generation(c, i)); 327 btrfs_node_ptr_generation(c, i));
328 if (btrfs_is_leaf(next) && 328 if (btrfs_is_leaf(next) &&
329 btrfs_header_level(c) != 1) 329 level != 1)
330 BUG(); 330 BUG();
331 if (btrfs_header_level(next) != 331 if (btrfs_header_level(next) !=
332 btrfs_header_level(c) - 1) 332 level - 1)
333 BUG(); 333 BUG();
334 btrfs_print_tree(root, next); 334 btrfs_print_tree(root, next);
335 free_extent_buffer(next); 335 free_extent_buffer(next);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b23dc209ae10..c04f7f212602 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -670,6 +670,8 @@ again:
670 err = ret; 670 err = ret;
671 goto out; 671 goto out;
672 } 672 }
673 if (ret > 0 && path2->slots[level] > 0)
674 path2->slots[level]--;
673 675
674 eb = path2->nodes[level]; 676 eb = path2->nodes[level];
675 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != 677 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
@@ -1609,6 +1611,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1609 BUG_ON(level == 0); 1611 BUG_ON(level == 0);
1610 path->lowest_level = level; 1612 path->lowest_level = level;
1611 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); 1613 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1614 path->lowest_level = 0;
1612 if (ret < 0) { 1615 if (ret < 0) {
1613 btrfs_free_path(path); 1616 btrfs_free_path(path);
1614 return ret; 1617 return ret;
@@ -1788,7 +1791,7 @@ static void merge_func(struct btrfs_work *work)
1788 btrfs_end_transaction(trans, root); 1791 btrfs_end_transaction(trans, root);
1789 } 1792 }
1790 1793
1791 btrfs_drop_dead_root(reloc_root); 1794 btrfs_drop_snapshot(reloc_root, 0);
1792 1795
1793 if (atomic_dec_and_test(async->num_pending)) 1796 if (atomic_dec_and_test(async->num_pending))
1794 complete(async->done); 1797 complete(async->done);
@@ -2075,9 +2078,6 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2075 2078
2076 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2079 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2077 BUG_ON(ret); 2080 BUG_ON(ret);
2078
2079 btrfs_tree_unlock(eb);
2080 free_extent_buffer(eb);
2081 } 2081 }
2082 if (!lowest) { 2082 if (!lowest) {
2083 btrfs_tree_unlock(upper->eb); 2083 btrfs_tree_unlock(upper->eb);
@@ -2553,8 +2553,13 @@ int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
2553 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; 2553 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
2554 2554
2555 /* make sure the dirty trick played by the caller work */ 2555 /* make sure the dirty trick played by the caller work */
2556 ret = invalidate_inode_pages2_range(inode->i_mapping, 2556 while (1) {
2557 first_index, last_index); 2557 ret = invalidate_inode_pages2_range(inode->i_mapping,
2558 first_index, last_index);
2559 if (ret != -EBUSY)
2560 break;
2561 schedule_timeout(HZ/10);
2562 }
2558 if (ret) 2563 if (ret)
2559 goto out_unlock; 2564 goto out_unlock;
2560 2565
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9f179d4832d5..6d6d06cb6dfc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/seq_file.h> 27#include <linux/seq_file.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h> 29#include <linux/backing-dev.h>
31#include <linux/mount.h> 30#include <linux/mount.h>
32#include <linux/mpage.h> 31#include <linux/mpage.h>
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4e83457ea253..cdbb5022da52 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -40,6 +40,12 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
40 } 40 }
41} 41}
42 42
43static noinline void switch_commit_root(struct btrfs_root *root)
44{
45 free_extent_buffer(root->commit_root);
46 root->commit_root = btrfs_root_node(root);
47}
48
43/* 49/*
44 * either allocate a new transaction or hop into the existing one 50 * either allocate a new transaction or hop into the existing one
45 */ 51 */
@@ -444,9 +450,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
444 450
445 btrfs_write_dirty_block_groups(trans, root); 451 btrfs_write_dirty_block_groups(trans, root);
446 452
447 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
448 BUG_ON(ret);
449
450 while (1) { 453 while (1) {
451 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 454 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
452 if (old_root_bytenr == root->node->start) 455 if (old_root_bytenr == root->node->start)
@@ -457,13 +460,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
457 &root->root_key, 460 &root->root_key,
458 &root->root_item); 461 &root->root_item);
459 BUG_ON(ret); 462 BUG_ON(ret);
460 btrfs_write_dirty_block_groups(trans, root);
461 463
462 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 464 ret = btrfs_write_dirty_block_groups(trans, root);
463 BUG_ON(ret); 465 BUG_ON(ret);
464 } 466 }
465 free_extent_buffer(root->commit_root); 467
466 root->commit_root = btrfs_root_node(root); 468 if (root != root->fs_info->extent_root)
469 switch_commit_root(root);
470
467 return 0; 471 return 0;
468} 472}
469 473
@@ -495,10 +499,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
495 root = list_entry(next, struct btrfs_root, dirty_list); 499 root = list_entry(next, struct btrfs_root, dirty_list);
496 500
497 update_cowonly_root(trans, root); 501 update_cowonly_root(trans, root);
498
499 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
500 BUG_ON(ret);
501 } 502 }
503
504 down_write(&fs_info->extent_commit_sem);
505 switch_commit_root(fs_info->extent_root);
506 up_write(&fs_info->extent_commit_sem);
507
502 return 0; 508 return 0;
503} 509}
504 510
@@ -544,8 +550,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
544 btrfs_update_reloc_root(trans, root); 550 btrfs_update_reloc_root(trans, root);
545 551
546 if (root->commit_root != root->node) { 552 if (root->commit_root != root->node) {
547 free_extent_buffer(root->commit_root); 553 switch_commit_root(root);
548 root->commit_root = btrfs_root_node(root);
549 btrfs_set_root_node(&root->root_item, 554 btrfs_set_root_node(&root->root_item,
550 root->node); 555 root->node);
551 } 556 }
@@ -593,6 +598,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
593 return 0; 598 return 0;
594} 599}
595 600
601#if 0
596/* 602/*
597 * when dropping snapshots, we generate a ton of delayed refs, and it makes 603 * when dropping snapshots, we generate a ton of delayed refs, and it makes
598 * sense not to join the transaction while it is trying to flush the current 604 * sense not to join the transaction while it is trying to flush the current
@@ -681,6 +687,7 @@ int btrfs_drop_dead_root(struct btrfs_root *root)
681 btrfs_btree_balance_dirty(tree_root, nr); 687 btrfs_btree_balance_dirty(tree_root, nr);
682 return ret; 688 return ret;
683} 689}
690#endif
684 691
685/* 692/*
686 * new snapshots need to be created at a very specific time in the 693 * new snapshots need to be created at a very specific time in the
@@ -850,6 +857,16 @@ static void update_super_roots(struct btrfs_root *root)
850 super->root_level = root_item->level; 857 super->root_level = root_item->level;
851} 858}
852 859
860int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
861{
862 int ret = 0;
863 spin_lock(&info->new_trans_lock);
864 if (info->running_transaction)
865 ret = info->running_transaction->in_commit;
866 spin_unlock(&info->new_trans_lock);
867 return ret;
868}
869
853int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 870int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
854 struct btrfs_root *root) 871 struct btrfs_root *root)
855{ 872{
@@ -941,9 +958,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
941 958
942 mutex_unlock(&root->fs_info->trans_mutex); 959 mutex_unlock(&root->fs_info->trans_mutex);
943 960
944 if (flush_on_commit || snap_pending) { 961 if (flush_on_commit) {
945 if (flush_on_commit) 962 btrfs_start_delalloc_inodes(root);
946 btrfs_start_delalloc_inodes(root); 963 ret = btrfs_wait_ordered_extents(root, 0);
964 BUG_ON(ret);
965 } else if (snap_pending) {
947 ret = btrfs_wait_ordered_extents(root, 1); 966 ret = btrfs_wait_ordered_extents(root, 1);
948 BUG_ON(ret); 967 BUG_ON(ret);
949 } 968 }
@@ -1007,15 +1026,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1007 1026
1008 btrfs_set_root_node(&root->fs_info->tree_root->root_item, 1027 btrfs_set_root_node(&root->fs_info->tree_root->root_item,
1009 root->fs_info->tree_root->node); 1028 root->fs_info->tree_root->node);
1010 free_extent_buffer(root->fs_info->tree_root->commit_root); 1029 switch_commit_root(root->fs_info->tree_root);
1011 root->fs_info->tree_root->commit_root =
1012 btrfs_root_node(root->fs_info->tree_root);
1013 1030
1014 btrfs_set_root_node(&root->fs_info->chunk_root->root_item, 1031 btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
1015 root->fs_info->chunk_root->node); 1032 root->fs_info->chunk_root->node);
1016 free_extent_buffer(root->fs_info->chunk_root->commit_root); 1033 switch_commit_root(root->fs_info->chunk_root);
1017 root->fs_info->chunk_root->commit_root =
1018 btrfs_root_node(root->fs_info->chunk_root);
1019 1034
1020 update_super_roots(root); 1035 update_super_roots(root);
1021 1036
@@ -1055,6 +1070,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1055 cur_trans->commit_done = 1; 1070 cur_trans->commit_done = 1;
1056 1071
1057 root->fs_info->last_trans_committed = cur_trans->transid; 1072 root->fs_info->last_trans_committed = cur_trans->transid;
1073
1058 wake_up(&cur_trans->commit_wait); 1074 wake_up(&cur_trans->commit_wait);
1059 1075
1060 put_transaction(cur_trans); 1076 put_transaction(cur_trans);
@@ -1081,7 +1097,7 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1081 while (!list_empty(&list)) { 1097 while (!list_empty(&list)) {
1082 root = list_entry(list.next, struct btrfs_root, root_list); 1098 root = list_entry(list.next, struct btrfs_root, root_list);
1083 list_del_init(&root->root_list); 1099 list_del_init(&root->root_list);
1084 btrfs_drop_dead_root(root); 1100 btrfs_drop_snapshot(root, 0);
1085 } 1101 }
1086 return 0; 1102 return 0;
1087} 1103}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 961c3ee5a2e1..663c67404918 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -107,4 +107,5 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
107 struct btrfs_root *root); 107 struct btrfs_root *root);
108int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, 108int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
109 struct extent_io_tree *dirty_pages); 109 struct extent_io_tree *dirty_pages);
110int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
110#endif 111#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c13922206d1b..d91b0de7c502 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -797,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
797 return -ENOENT; 797 return -ENOENT;
798 798
799 inode = read_one_inode(root, key->objectid); 799 inode = read_one_inode(root, key->objectid);
800 BUG_ON(!dir); 800 BUG_ON(!inode);
801 801
802 ref_ptr = btrfs_item_ptr_offset(eb, slot); 802 ref_ptr = btrfs_item_ptr_offset(eb, slot);
803 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 803 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3ab80e9cd767..5dbefd11b4af 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -721,7 +721,8 @@ error:
721 */ 721 */
722static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, 722static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
723 struct btrfs_device *device, 723 struct btrfs_device *device,
724 u64 num_bytes, u64 *start) 724 u64 num_bytes, u64 *start,
725 u64 *max_avail)
725{ 726{
726 struct btrfs_key key; 727 struct btrfs_key key;
727 struct btrfs_root *root = device->dev_root; 728 struct btrfs_root *root = device->dev_root;
@@ -758,9 +759,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
758 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 759 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
759 if (ret < 0) 760 if (ret < 0)
760 goto error; 761 goto error;
761 ret = btrfs_previous_item(root, path, 0, key.type); 762 if (ret > 0) {
762 if (ret < 0) 763 ret = btrfs_previous_item(root, path, key.objectid, key.type);
763 goto error; 764 if (ret < 0)
765 goto error;
766 if (ret > 0)
767 start_found = 1;
768 }
764 l = path->nodes[0]; 769 l = path->nodes[0];
765 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 770 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
766 while (1) { 771 while (1) {
@@ -803,6 +808,10 @@ no_more_items:
803 if (last_byte < search_start) 808 if (last_byte < search_start)
804 last_byte = search_start; 809 last_byte = search_start;
805 hole_size = key.offset - last_byte; 810 hole_size = key.offset - last_byte;
811
812 if (hole_size > *max_avail)
813 *max_avail = hole_size;
814
806 if (key.offset > last_byte && 815 if (key.offset > last_byte &&
807 hole_size >= num_bytes) { 816 hole_size >= num_bytes) {
808 *start = last_byte; 817 *start = last_byte;
@@ -1621,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1621 device->fs_devices->total_rw_bytes += diff; 1630 device->fs_devices->total_rw_bytes += diff;
1622 1631
1623 device->total_bytes = new_size; 1632 device->total_bytes = new_size;
1633 device->disk_total_bytes = new_size;
1624 btrfs_clear_space_info_full(device->dev_root->fs_info); 1634 btrfs_clear_space_info_full(device->dev_root->fs_info);
1625 1635
1626 return btrfs_update_device(trans, device); 1636 return btrfs_update_device(trans, device);
@@ -2007,7 +2017,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2007 goto done; 2017 goto done;
2008 if (ret) { 2018 if (ret) {
2009 ret = 0; 2019 ret = 0;
2010 goto done; 2020 break;
2011 } 2021 }
2012 2022
2013 l = path->nodes[0]; 2023 l = path->nodes[0];
@@ -2015,7 +2025,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2015 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2025 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2016 2026
2017 if (key.objectid != device->devid) 2027 if (key.objectid != device->devid)
2018 goto done; 2028 break;
2019 2029
2020 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2030 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2021 length = btrfs_dev_extent_length(l, dev_extent); 2031 length = btrfs_dev_extent_length(l, dev_extent);
@@ -2171,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2171 max_chunk_size); 2181 max_chunk_size);
2172 2182
2173again: 2183again:
2184 max_avail = 0;
2174 if (!map || map->num_stripes != num_stripes) { 2185 if (!map || map->num_stripes != num_stripes) {
2175 kfree(map); 2186 kfree(map);
2176 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2187 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -2219,7 +2230,8 @@ again:
2219 2230
2220 if (device->in_fs_metadata && avail >= min_free) { 2231 if (device->in_fs_metadata && avail >= min_free) {
2221 ret = find_free_dev_extent(trans, device, 2232 ret = find_free_dev_extent(trans, device,
2222 min_free, &dev_offset); 2233 min_free, &dev_offset,
2234 &max_avail);
2223 if (ret == 0) { 2235 if (ret == 0) {
2224 list_move_tail(&device->dev_alloc_list, 2236 list_move_tail(&device->dev_alloc_list,
2225 &private_devs); 2237 &private_devs);
@@ -2795,26 +2807,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2795 } 2807 }
2796 } 2808 }
2797 2809
2798 for (i = 0; i > nr; i++) {
2799 struct btrfs_multi_bio *multi;
2800 struct btrfs_bio_stripe *stripe;
2801 int ret;
2802
2803 length = 1;
2804 ret = btrfs_map_block(map_tree, WRITE, buf[i],
2805 &length, &multi, 0);
2806 BUG_ON(ret);
2807
2808 stripe = multi->stripes;
2809 for (j = 0; j < multi->num_stripes; j++) {
2810 if (stripe->physical >= physical &&
2811 physical < stripe->physical + length)
2812 break;
2813 }
2814 BUG_ON(j >= multi->num_stripes);
2815 kfree(multi);
2816 }
2817
2818 *logical = buf; 2810 *logical = buf;
2819 *naddrs = nr; 2811 *naddrs = nr;
2820 *stripe_len = map->stripe_len; 2812 *stripe_len = map->stripe_len;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ecfbce836d32..3e2b90eaa239 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -208,7 +208,7 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
208 *total_in = 0; 208 *total_in = 0;
209 209
210 workspace = find_zlib_workspace(); 210 workspace = find_zlib_workspace();
211 if (!workspace) 211 if (IS_ERR(workspace))
212 return -1; 212 return -1;
213 213
214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
@@ -366,7 +366,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
366 char *kaddr; 366 char *kaddr;
367 367
368 workspace = find_zlib_workspace(); 368 workspace = find_zlib_workspace();
369 if (!workspace) 369 if (IS_ERR(workspace))
370 return -ENOMEM; 370 return -ENOMEM;
371 371
372 data_in = kmap(pages_in[page_in_index]); 372 data_in = kmap(pages_in[page_in_index]);
@@ -547,7 +547,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
547 return -ENOMEM; 547 return -ENOMEM;
548 548
549 workspace = find_zlib_workspace(); 549 workspace = find_zlib_workspace();
550 if (!workspace) 550 if (IS_ERR(workspace))
551 return -ENOMEM; 551 return -ENOMEM;
552 552
553 workspace->inf_strm.next_in = data_in; 553 workspace->inf_strm.next_in = data_in;