Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig                        |   27
-rw-r--r--  fs/btrfs/async-thread.c           |    4
-rw-r--r--  fs/btrfs/ctree.c                  |  121
-rw-r--r--  fs/btrfs/ctree.h                  |   29
-rw-r--r--  fs/btrfs/disk-io.c                |   15
-rw-r--r--  fs/btrfs/extent-tree.c            |  516
-rw-r--r--  fs/btrfs/free-space-cache.c       | 1003
-rw-r--r--  fs/btrfs/free-space-cache.h       |    8
-rw-r--r--  fs/btrfs/inode.c                  |    2
-rw-r--r--  fs/btrfs/print-tree.c             |    6
-rw-r--r--  fs/btrfs/relocation.c             |    3
-rw-r--r--  fs/btrfs/transaction.c            |   40
-rw-r--r--  fs/btrfs/tree-log.c               |    2
-rw-r--r--  fs/btrfs/volumes.c                |   46
-rw-r--r--  fs/cifs/connect.c                 |    8
-rw-r--r--  fs/cifs/inode.c                   |    9
-rw-r--r--  fs/ecryptfs/keystore.c            |   13
-rw-r--r--  fs/ext3/dir.c                     |    3
-rw-r--r--  fs/ext3/inode.c                   |   32
-rw-r--r--  fs/jbd/journal.c                  |   26
-rw-r--r--  fs/jbd/transaction.c              |   68
-rw-r--r--  fs/jfs/acl.c                      |    4
-rw-r--r--  fs/nfs/client.c                   |   18
-rw-r--r--  fs/nfs/dir.c                      |    2
-rw-r--r--  fs/nfs/nfs4_fs.h                  |    6
-rw-r--r--  fs/nfs/nfs4proc.c                 |   40
-rw-r--r--  fs/nfs/nfs4state.c                |    2
-rw-r--r--  fs/nilfs2/Kconfig                 |   25
-rw-r--r--  fs/notify/Kconfig                 |   12
-rw-r--r--  fs/notify/dnotify/Kconfig         |    2
-rw-r--r--  fs/notify/fsnotify.c              |    4
-rw-r--r--  fs/notify/inotify/Kconfig         |    2
-rw-r--r--  fs/notify/inotify/inotify_user.c  |  109
-rw-r--r--  fs/notify/notification.c          |   19
-rw-r--r--  fs/pipe.c                         |    4
35 files changed, 1591 insertions(+), 639 deletions(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index a97263be6a91..0e7da7bb5d93 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -186,32 +186,7 @@ source "fs/romfs/Kconfig"
186source "fs/sysv/Kconfig" 186source "fs/sysv/Kconfig"
187source "fs/ufs/Kconfig" 187source "fs/ufs/Kconfig"
188source "fs/exofs/Kconfig" 188source "fs/exofs/Kconfig"
189 189source "fs/nilfs2/Kconfig"
190config NILFS2_FS
191 tristate "NILFS2 file system support (EXPERIMENTAL)"
192 depends on BLOCK && EXPERIMENTAL
193 select CRC32
194 help
195 NILFS2 is a log-structured file system (LFS) supporting continuous
196 snapshotting. In addition to versioning capability of the entire
197 file system, users can even restore files mistakenly overwritten or
198 destroyed just a few seconds ago. Since this file system can keep
199 consistency like conventional LFS, it achieves quick recovery after
200 system crashes.
201
202 NILFS2 creates a number of checkpoints every few seconds or per
203 synchronous write basis (unless there is no change). Users can
204 select significant versions among continuously created checkpoints,
205 and can change them into snapshots which will be preserved for long
206 periods until they are changed back to checkpoints. Each
207 snapshot is mountable as a read-only file system concurrently with
208 its writable mount, and this feature is convenient for online backup.
209
210 Some features including atime, extended attributes, and POSIX ACLs,
211 are not supported yet.
212
213 To compile this file system support as a module, choose M here: the
214 module will be called nilfs2. If unsure, say N.
215 190
216endif # MISC_FILESYSTEMS 191endif # MISC_FILESYSTEMS
217 192
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 6e4f6c50a120..019e8af449ab 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work)
424 * list 424 * list
425 */ 425 */
426 if (worker->idle) { 426 if (worker->idle) {
427 spin_lock_irqsave(&worker->workers->lock, flags); 427 spin_lock(&worker->workers->lock);
428 worker->idle = 0; 428 worker->idle = 0;
429 list_move_tail(&worker->worker_list, 429 list_move_tail(&worker->worker_list,
430 &worker->workers->worker_list); 430 &worker->workers->worker_list);
431 spin_unlock_irqrestore(&worker->workers->lock, flags); 431 spin_unlock(&worker->workers->lock);
432 } 432 }
433 if (!worker->working) { 433 if (!worker->working) {
434 wake = 1; 434 wake = 1;
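
The hunk above replaces spin_lock_irqsave() with plain spin_lock() on workers->lock, presumably because this lock is never taken from interrupt context. For reference, the two locking forms differ roughly as follows (a minimal sketch with made-up names, not code from this patch):

	#include <linux/spinlock.h>

	struct demo {
		spinlock_t lock;
		int counter;
	};

	/* Cheaper form: leaves interrupts enabled. Only safe while no IRQ
	 * handler can try to take the same lock. */
	static void demo_bump_process_context(struct demo *d)
	{
		spin_lock(&d->lock);
		d->counter++;
		spin_unlock(&d->lock);
	}

	/* Heavier form: disables local interrupts and restores their previous
	 * state, so the lock may also be taken from interrupt context. */
	static void demo_bump_any_context(struct demo *d)
	{
		unsigned long flags;

		spin_lock_irqsave(&d->lock, flags);
		d->counter++;
		spin_unlock_irqrestore(&d->lock, flags);
	}
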
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 60a45f3a4e91..3fdcc0512d3a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -557,19 +557,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
557 557
558 btrfs_disk_key_to_cpu(&k1, disk); 558 btrfs_disk_key_to_cpu(&k1, disk);
559 559
560 if (k1.objectid > k2->objectid) 560 return btrfs_comp_cpu_keys(&k1, k2);
561 return 1;
562 if (k1.objectid < k2->objectid)
563 return -1;
564 if (k1.type > k2->type)
565 return 1;
566 if (k1.type < k2->type)
567 return -1;
568 if (k1.offset > k2->offset)
569 return 1;
570 if (k1.offset < k2->offset)
571 return -1;
572 return 0;
573} 561}
574 562
575/* 563/*
@@ -1052,9 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1052 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1040 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1053 return 0; 1041 return 0;
1054 1042
1055 if (btrfs_header_nritems(mid) > 2)
1056 return 0;
1057
1058 if (btrfs_header_nritems(mid) < 2) 1043 if (btrfs_header_nritems(mid) < 2)
1059 err_on_enospc = 1; 1044 err_on_enospc = 1;
1060 1045
@@ -1701,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1701 struct extent_buffer *b; 1686 struct extent_buffer *b;
1702 int slot; 1687 int slot;
1703 int ret; 1688 int ret;
1689 int err;
1704 int level; 1690 int level;
1705 int lowest_unlock = 1; 1691 int lowest_unlock = 1;
1706 u8 lowest_level = 0; 1692 u8 lowest_level = 0;
@@ -1737,8 +1723,6 @@ again:
1737 p->locks[level] = 1; 1723 p->locks[level] = 1;
1738 1724
1739 if (cow) { 1725 if (cow) {
1740 int wret;
1741
1742 /* 1726 /*
1743 * if we don't really need to cow this block 1727 * if we don't really need to cow this block
1744 * then we don't want to set the path blocking, 1728 * then we don't want to set the path blocking,
@@ -1749,12 +1733,12 @@ again:
1749 1733
1750 btrfs_set_path_blocking(p); 1734 btrfs_set_path_blocking(p);
1751 1735
1752 wret = btrfs_cow_block(trans, root, b, 1736 err = btrfs_cow_block(trans, root, b,
1753 p->nodes[level + 1], 1737 p->nodes[level + 1],
1754 p->slots[level + 1], &b); 1738 p->slots[level + 1], &b);
1755 if (wret) { 1739 if (err) {
1756 free_extent_buffer(b); 1740 free_extent_buffer(b);
1757 ret = wret; 1741 ret = err;
1758 goto done; 1742 goto done;
1759 } 1743 }
1760 } 1744 }
@@ -1793,41 +1777,45 @@ cow_done:
1793 ret = bin_search(b, key, level, &slot); 1777 ret = bin_search(b, key, level, &slot);
1794 1778
1795 if (level != 0) { 1779 if (level != 0) {
1796 if (ret && slot > 0) 1780 int dec = 0;
1781 if (ret && slot > 0) {
1782 dec = 1;
1797 slot -= 1; 1783 slot -= 1;
1784 }
1798 p->slots[level] = slot; 1785 p->slots[level] = slot;
1799 ret = setup_nodes_for_search(trans, root, p, b, level, 1786 err = setup_nodes_for_search(trans, root, p, b, level,
1800 ins_len); 1787 ins_len);
1801 if (ret == -EAGAIN) 1788 if (err == -EAGAIN)
1802 goto again; 1789 goto again;
1803 else if (ret) 1790 if (err) {
1791 ret = err;
1804 goto done; 1792 goto done;
1793 }
1805 b = p->nodes[level]; 1794 b = p->nodes[level];
1806 slot = p->slots[level]; 1795 slot = p->slots[level];
1807 1796
1808 unlock_up(p, level, lowest_unlock); 1797 unlock_up(p, level, lowest_unlock);
1809 1798
1810 /* this is only true while dropping a snapshot */
1811 if (level == lowest_level) { 1799 if (level == lowest_level) {
1812 ret = 0; 1800 if (dec)
1801 p->slots[level]++;
1813 goto done; 1802 goto done;
1814 } 1803 }
1815 1804
1816 ret = read_block_for_search(trans, root, p, 1805 err = read_block_for_search(trans, root, p,
1817 &b, level, slot, key); 1806 &b, level, slot, key);
1818 if (ret == -EAGAIN) 1807 if (err == -EAGAIN)
1819 goto again; 1808 goto again;
1820 1809 if (err) {
1821 if (ret == -EIO) 1810 ret = err;
1822 goto done; 1811 goto done;
1812 }
1823 1813
1824 if (!p->skip_locking) { 1814 if (!p->skip_locking) {
1825 int lret;
1826
1827 btrfs_clear_path_blocking(p, NULL); 1815 btrfs_clear_path_blocking(p, NULL);
1828 lret = btrfs_try_spin_lock(b); 1816 err = btrfs_try_spin_lock(b);
1829 1817
1830 if (!lret) { 1818 if (!err) {
1831 btrfs_set_path_blocking(p); 1819 btrfs_set_path_blocking(p);
1832 btrfs_tree_lock(b); 1820 btrfs_tree_lock(b);
1833 btrfs_clear_path_blocking(p, b); 1821 btrfs_clear_path_blocking(p, b);
@@ -1837,16 +1825,14 @@ cow_done:
1837 p->slots[level] = slot; 1825 p->slots[level] = slot;
1838 if (ins_len > 0 && 1826 if (ins_len > 0 &&
1839 btrfs_leaf_free_space(root, b) < ins_len) { 1827 btrfs_leaf_free_space(root, b) < ins_len) {
1840 int sret;
1841
1842 btrfs_set_path_blocking(p); 1828 btrfs_set_path_blocking(p);
1843 sret = split_leaf(trans, root, key, 1829 err = split_leaf(trans, root, key,
1844 p, ins_len, ret == 0); 1830 p, ins_len, ret == 0);
1845 btrfs_clear_path_blocking(p, NULL); 1831 btrfs_clear_path_blocking(p, NULL);
1846 1832
1847 BUG_ON(sret > 0); 1833 BUG_ON(err > 0);
1848 if (sret) { 1834 if (err) {
1849 ret = sret; 1835 ret = err;
1850 goto done; 1836 goto done;
1851 } 1837 }
1852 } 1838 }
@@ -3807,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3807 } 3793 }
3808 3794
3809 /* delete the leaf if it is mostly empty */ 3795 /* delete the leaf if it is mostly empty */
3810 if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) { 3796 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
3811 /* push_leaf_left fixes the path. 3797 /* push_leaf_left fixes the path.
3812 * make sure the path still points to our leaf 3798 * make sure the path still points to our leaf
3813 * for possible call to del_ptr below 3799 * for possible call to del_ptr below
@@ -4042,10 +4028,9 @@ out:
4042 * calling this function. 4028 * calling this function.
4043 */ 4029 */
4044int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, 4030int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4045 struct btrfs_key *key, int lowest_level, 4031 struct btrfs_key *key, int level,
4046 int cache_only, u64 min_trans) 4032 int cache_only, u64 min_trans)
4047{ 4033{
4048 int level = lowest_level;
4049 int slot; 4034 int slot;
4050 struct extent_buffer *c; 4035 struct extent_buffer *c;
4051 4036
@@ -4058,11 +4043,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4058 c = path->nodes[level]; 4043 c = path->nodes[level];
4059next: 4044next:
4060 if (slot >= btrfs_header_nritems(c)) { 4045 if (slot >= btrfs_header_nritems(c)) {
4061 level++; 4046 int ret;
4062 if (level == BTRFS_MAX_LEVEL) 4047 int orig_lowest;
4048 struct btrfs_key cur_key;
4049 if (level + 1 >= BTRFS_MAX_LEVEL ||
4050 !path->nodes[level + 1])
4063 return 1; 4051 return 1;
4064 continue; 4052
4053 if (path->locks[level + 1]) {
4054 level++;
4055 continue;
4056 }
4057
4058 slot = btrfs_header_nritems(c) - 1;
4059 if (level == 0)
4060 btrfs_item_key_to_cpu(c, &cur_key, slot);
4061 else
4062 btrfs_node_key_to_cpu(c, &cur_key, slot);
4063
4064 orig_lowest = path->lowest_level;
4065 btrfs_release_path(root, path);
4066 path->lowest_level = level;
4067 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4068 0, 0);
4069 path->lowest_level = orig_lowest;
4070 if (ret < 0)
4071 return ret;
4072
4073 c = path->nodes[level];
4074 slot = path->slots[level];
4075 if (ret == 0)
4076 slot++;
4077 goto next;
4065 } 4078 }
4079
4066 if (level == 0) 4080 if (level == 0)
4067 btrfs_item_key_to_cpu(c, key, slot); 4081 btrfs_item_key_to_cpu(c, key, slot);
4068 else { 4082 else {
@@ -4146,7 +4160,8 @@ again:
4146 * advance the path if there are now more items available. 4160 * advance the path if there are now more items available.
4147 */ 4161 */
4148 if (nritems > 0 && path->slots[0] < nritems - 1) { 4162 if (nritems > 0 && path->slots[0] < nritems - 1) {
4149 path->slots[0]++; 4163 if (ret == 0)
4164 path->slots[0]++;
4150 ret = 0; 4165 ret = 0;
4151 goto done; 4166 goto done;
4152 } 4167 }
@@ -4278,10 +4293,10 @@ int btrfs_previous_item(struct btrfs_root *root,
4278 path->slots[0]--; 4293 path->slots[0]--;
4279 4294
4280 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4295 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4281 if (found_key.type == type)
4282 return 0;
4283 if (found_key.objectid < min_objectid) 4296 if (found_key.objectid < min_objectid)
4284 break; 4297 break;
4298 if (found_key.type == type)
4299 return 0;
4285 if (found_key.objectid == min_objectid && 4300 if (found_key.objectid == min_objectid &&
4286 found_key.type < type) 4301 found_key.type < type)
4287 break; 4302 break;
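
Several of the ctree.c hunks are pure consolidation: comp_keys() loses its open-coded comparison and delegates to btrfs_comp_cpu_keys() instead. For reference, the ordering that the deleted lines implemented is shown standalone below (a sketch; the demo_ names are not from the patch):

	#include <linux/types.h>

	struct demo_key {
		u64 objectid;
		u8  type;
		u64 offset;
	};

	/* Lexicographic compare on (objectid, type, offset), the same ordering
	 * the removed lines in comp_keys() coded by hand. */
	static int demo_comp_keys(const struct demo_key *k1,
				  const struct demo_key *k2)
	{
		if (k1->objectid > k2->objectid)
			return 1;
		if (k1->objectid < k2->objectid)
			return -1;
		if (k1->type > k2->type)
			return 1;
		if (k1->type < k2->type)
			return -1;
		if (k1->offset > k2->offset)
			return 1;
		if (k1->offset < k2->offset)
			return -1;
		return 0;
	}
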
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 98a873838717..215ef8cae823 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -481,7 +481,7 @@ struct btrfs_shared_data_ref {
481 481
482struct btrfs_extent_inline_ref { 482struct btrfs_extent_inline_ref {
483 u8 type; 483 u8 type;
484 u64 offset; 484 __le64 offset;
485} __attribute__ ((__packed__)); 485} __attribute__ ((__packed__));
486 486
487/* old style backrefs item */ 487/* old style backrefs item */
@@ -689,6 +689,7 @@ struct btrfs_space_info {
689 struct list_head block_groups; 689 struct list_head block_groups;
690 spinlock_t lock; 690 spinlock_t lock;
691 struct rw_semaphore groups_sem; 691 struct rw_semaphore groups_sem;
692 atomic_t caching_threads;
692}; 693};
693 694
694/* 695/*
@@ -707,6 +708,9 @@ struct btrfs_free_cluster {
707 /* first extent starting offset */ 708 /* first extent starting offset */
708 u64 window_start; 709 u64 window_start;
709 710
711 /* if this cluster simply points at a bitmap in the block group */
712 bool points_to_bitmap;
713
710 struct btrfs_block_group_cache *block_group; 714 struct btrfs_block_group_cache *block_group;
711 /* 715 /*
712 * when a cluster is allocated from a block group, we put the 716 * when a cluster is allocated from a block group, we put the
@@ -716,24 +720,37 @@ struct btrfs_free_cluster {
716 struct list_head block_group_list; 720 struct list_head block_group_list;
717}; 721};
718 722
723enum btrfs_caching_type {
724 BTRFS_CACHE_NO = 0,
725 BTRFS_CACHE_STARTED = 1,
726 BTRFS_CACHE_FINISHED = 2,
727};
728
719struct btrfs_block_group_cache { 729struct btrfs_block_group_cache {
720 struct btrfs_key key; 730 struct btrfs_key key;
721 struct btrfs_block_group_item item; 731 struct btrfs_block_group_item item;
732 struct btrfs_fs_info *fs_info;
722 spinlock_t lock; 733 spinlock_t lock;
723 struct mutex cache_mutex;
724 u64 pinned; 734 u64 pinned;
725 u64 reserved; 735 u64 reserved;
726 u64 flags; 736 u64 flags;
727 int cached; 737 u64 sectorsize;
738 int extents_thresh;
739 int free_extents;
740 int total_bitmaps;
728 int ro; 741 int ro;
729 int dirty; 742 int dirty;
730 743
744 /* cache tracking stuff */
745 wait_queue_head_t caching_q;
746 int cached;
747
731 struct btrfs_space_info *space_info; 748 struct btrfs_space_info *space_info;
732 749
733 /* free space cache stuff */ 750 /* free space cache stuff */
734 spinlock_t tree_lock; 751 spinlock_t tree_lock;
735 struct rb_root free_space_bytes;
736 struct rb_root free_space_offset; 752 struct rb_root free_space_offset;
753 u64 free_space;
737 754
738 /* block group cache stuff */ 755 /* block group cache stuff */
739 struct rb_node cache_node; 756 struct rb_node cache_node;
@@ -942,6 +959,9 @@ struct btrfs_root {
942 /* the node lock is held while changing the node pointer */ 959 /* the node lock is held while changing the node pointer */
943 spinlock_t node_lock; 960 spinlock_t node_lock;
944 961
962 /* taken when updating the commit root */
963 struct rw_semaphore commit_root_sem;
964
945 struct extent_buffer *commit_root; 965 struct extent_buffer *commit_root;
946 struct btrfs_root *log_root; 966 struct btrfs_root *log_root;
947 struct btrfs_root *reloc_root; 967 struct btrfs_root *reloc_root;
@@ -1988,6 +2008,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
1988 u64 bytes); 2008 u64 bytes);
1989void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 2009void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
1990 u64 bytes); 2010 u64 bytes);
2011void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
1991/* ctree.c */ 2012/* ctree.c */
1992int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2013int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1993 int level, int *slot); 2014 int level, int *slot);
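
The ctree.h hunks turn the block group's cached flag into a small state machine (BTRFS_CACHE_NO / STARTED / FINISHED) and add caching_q and caching_threads for the async caching introduced in extent-tree.c below. The read/write pattern is roughly the following (a sketch modelled on block_group_cache_done() and cache_block_group() from this series; the demo_ names are invented):

	#include <linux/spinlock.h>

	enum demo_caching_type {
		DEMO_CACHE_NO = 0,
		DEMO_CACHE_STARTED = 1,
		DEMO_CACHE_FINISHED = 2,
	};

	struct demo_block_group {
		spinlock_t lock;
		int cached;			/* enum demo_caching_type */
	};

	/* Lockless check used on hot paths; the barrier pairs with the store
	 * made by the caching thread when it finishes. */
	static int demo_cache_done(struct demo_block_group *bg)
	{
		smp_mb();
		return bg->cached == DEMO_CACHE_FINISHED;
	}

	/* State transitions happen under the lock so only one caching thread
	 * is ever started per block group. */
	static int demo_start_caching(struct demo_block_group *bg)
	{
		spin_lock(&bg->lock);
		if (bg->cached != DEMO_CACHE_NO) {
			spin_unlock(&bg->lock);
			return 0;		/* already started or finished */
		}
		bg->cached = DEMO_CACHE_STARTED;
		spin_unlock(&bg->lock);
		return 1;			/* caller should start the kthread */
	}
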
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d28d29c95f7c..7dcaa8138864 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -909,6 +909,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
909 spin_lock_init(&root->inode_lock); 909 spin_lock_init(&root->inode_lock);
910 mutex_init(&root->objectid_mutex); 910 mutex_init(&root->objectid_mutex);
911 mutex_init(&root->log_mutex); 911 mutex_init(&root->log_mutex);
912 init_rwsem(&root->commit_root_sem);
912 init_waitqueue_head(&root->log_writer_wait); 913 init_waitqueue_head(&root->log_writer_wait);
913 init_waitqueue_head(&root->log_commit_wait[0]); 914 init_waitqueue_head(&root->log_commit_wait[0]);
914 init_waitqueue_head(&root->log_commit_wait[1]); 915 init_waitqueue_head(&root->log_commit_wait[1]);
@@ -1799,6 +1800,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1799 btrfs_super_chunk_root(disk_super), 1800 btrfs_super_chunk_root(disk_super),
1800 blocksize, generation); 1801 blocksize, generation);
1801 BUG_ON(!chunk_root->node); 1802 BUG_ON(!chunk_root->node);
1803 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
1804 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
1805 sb->s_id);
1806 goto fail_chunk_root;
1807 }
1802 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); 1808 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
1803 chunk_root->commit_root = btrfs_root_node(chunk_root); 1809 chunk_root->commit_root = btrfs_root_node(chunk_root);
1804 1810
@@ -1826,6 +1832,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1826 blocksize, generation); 1832 blocksize, generation);
1827 if (!tree_root->node) 1833 if (!tree_root->node)
1828 goto fail_chunk_root; 1834 goto fail_chunk_root;
1835 if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1836 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
1837 sb->s_id);
1838 goto fail_tree_root;
1839 }
1829 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 1840 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
1830 tree_root->commit_root = btrfs_root_node(tree_root); 1841 tree_root->commit_root = btrfs_root_node(tree_root);
1831 1842
@@ -2322,6 +2333,9 @@ int close_ctree(struct btrfs_root *root)
2322 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2333 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2323 } 2334 }
2324 2335
2336 fs_info->closing = 2;
2337 smp_mb();
2338
2325 if (fs_info->delalloc_bytes) { 2339 if (fs_info->delalloc_bytes) {
2326 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", 2340 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
2327 (unsigned long long)fs_info->delalloc_bytes); 2341 (unsigned long long)fs_info->delalloc_bytes);
@@ -2343,6 +2357,7 @@ int close_ctree(struct btrfs_root *root)
2343 free_extent_buffer(root->fs_info->csum_root->commit_root); 2357 free_extent_buffer(root->fs_info->csum_root->commit_root);
2344 2358
2345 btrfs_free_block_groups(root->fs_info); 2359 btrfs_free_block_groups(root->fs_info);
2360 btrfs_free_pinned_extents(root->fs_info);
2346 2361
2347 del_fs_roots(fs_info); 2362 del_fs_roots(fs_info);
2348 2363
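
Both disk-io.c hunks add the same guard: after reading the chunk or tree root, the buffer may exist without being uptodate, so open_ctree() now fails the mount instead of using it. Pieced together from the calls visible in the hunks, the pattern is roughly (a sketch, not a function from the patch):

	/* btrfs-internal headers assumed, as in disk-io.c itself */
	#include "ctree.h"
	#include "extent_io.h"
	#include "disk-io.h"

	static struct extent_buffer *demo_read_root(struct btrfs_root *root,
						    u64 bytenr, u32 blocksize,
						    u64 generation)
	{
		struct extent_buffer *eb;

		eb = read_tree_block(root, bytenr, blocksize, generation);
		if (!eb)
			return NULL;

		/* allocated but the read failed: treat it like a missing root */
		if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) {
			free_extent_buffer(eb);
			return NULL;
		}
		return eb;
	}
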
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a5aca3997d42..fadf69a2764b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -21,6 +21,7 @@
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/sort.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h>
24#include "compat.h" 25#include "compat.h"
25#include "hash.h" 26#include "hash.h"
26#include "ctree.h" 27#include "ctree.h"
@@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
61 struct btrfs_root *extent_root, u64 alloc_bytes, 62 struct btrfs_root *extent_root, u64 alloc_bytes,
62 u64 flags, int force); 63 u64 flags, int force);
63 64
65static noinline int
66block_group_cache_done(struct btrfs_block_group_cache *cache)
67{
68 smp_mb();
69 return cache->cached == BTRFS_CACHE_FINISHED;
70}
71
64static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 72static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65{ 73{
66 return (cache->flags & bits) == bits; 74 return (cache->flags & bits) == bits;
@@ -146,20 +154,70 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
146} 154}
147 155
148/* 156/*
157 * We always set EXTENT_LOCKED for the super mirror extents so we don't
158 * overwrite them, so those bits need to be unset. Also, if we are unmounting
159 * with pinned extents still sitting there because we had a block group caching,
160 * we need to clear those now, since we are done.
161 */
162void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
163{
164 u64 start, end, last = 0;
165 int ret;
166
167 while (1) {
168 ret = find_first_extent_bit(&info->pinned_extents, last,
169 &start, &end,
170 EXTENT_LOCKED|EXTENT_DIRTY);
171 if (ret)
172 break;
173
174 clear_extent_bits(&info->pinned_extents, start, end,
175 EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
176 last = end+1;
177 }
178}
179
180static int remove_sb_from_cache(struct btrfs_root *root,
181 struct btrfs_block_group_cache *cache)
182{
183 struct btrfs_fs_info *fs_info = root->fs_info;
184 u64 bytenr;
185 u64 *logical;
186 int stripe_len;
187 int i, nr, ret;
188
189 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
190 bytenr = btrfs_sb_offset(i);
191 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
192 cache->key.objectid, bytenr,
193 0, &logical, &nr, &stripe_len);
194 BUG_ON(ret);
195 while (nr--) {
196 try_lock_extent(&fs_info->pinned_extents,
197 logical[nr],
198 logical[nr] + stripe_len - 1, GFP_NOFS);
199 }
200 kfree(logical);
201 }
202
203 return 0;
204}
205
206/*
149 * this is only called by cache_block_group, since we could have freed extents 207 * this is only called by cache_block_group, since we could have freed extents
150 * we need to check the pinned_extents for any extents that can't be used yet 208 * we need to check the pinned_extents for any extents that can't be used yet
151 * since their free space will be released as soon as the transaction commits. 209 * since their free space will be released as soon as the transaction commits.
152 */ 210 */
153static int add_new_free_space(struct btrfs_block_group_cache *block_group, 211static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
154 struct btrfs_fs_info *info, u64 start, u64 end) 212 struct btrfs_fs_info *info, u64 start, u64 end)
155{ 213{
156 u64 extent_start, extent_end, size; 214 u64 extent_start, extent_end, size, total_added = 0;
157 int ret; 215 int ret;
158 216
159 while (start < end) { 217 while (start < end) {
160 ret = find_first_extent_bit(&info->pinned_extents, start, 218 ret = find_first_extent_bit(&info->pinned_extents, start,
161 &extent_start, &extent_end, 219 &extent_start, &extent_end,
162 EXTENT_DIRTY); 220 EXTENT_DIRTY|EXTENT_LOCKED);
163 if (ret) 221 if (ret)
164 break; 222 break;
165 223
@@ -167,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
167 start = extent_end + 1; 225 start = extent_end + 1;
168 } else if (extent_start > start && extent_start < end) { 226 } else if (extent_start > start && extent_start < end) {
169 size = extent_start - start; 227 size = extent_start - start;
228 total_added += size;
170 ret = btrfs_add_free_space(block_group, start, 229 ret = btrfs_add_free_space(block_group, start,
171 size); 230 size);
172 BUG_ON(ret); 231 BUG_ON(ret);
@@ -178,84 +237,79 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
178 237
179 if (start < end) { 238 if (start < end) {
180 size = end - start; 239 size = end - start;
240 total_added += size;
181 ret = btrfs_add_free_space(block_group, start, size); 241 ret = btrfs_add_free_space(block_group, start, size);
182 BUG_ON(ret); 242 BUG_ON(ret);
183 } 243 }
184 244
185 return 0; 245 return total_added;
186} 246}
187 247
188static int remove_sb_from_cache(struct btrfs_root *root, 248static int caching_kthread(void *data)
189 struct btrfs_block_group_cache *cache)
190{
191 u64 bytenr;
192 u64 *logical;
193 int stripe_len;
194 int i, nr, ret;
195
196 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
197 bytenr = btrfs_sb_offset(i);
198 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
199 cache->key.objectid, bytenr, 0,
200 &logical, &nr, &stripe_len);
201 BUG_ON(ret);
202 while (nr--) {
203 btrfs_remove_free_space(cache, logical[nr],
204 stripe_len);
205 }
206 kfree(logical);
207 }
208 return 0;
209}
210
211static int cache_block_group(struct btrfs_root *root,
212 struct btrfs_block_group_cache *block_group)
213{ 249{
250 struct btrfs_block_group_cache *block_group = data;
251 struct btrfs_fs_info *fs_info = block_group->fs_info;
252 u64 last = 0;
214 struct btrfs_path *path; 253 struct btrfs_path *path;
215 int ret = 0; 254 int ret = 0;
216 struct btrfs_key key; 255 struct btrfs_key key;
217 struct extent_buffer *leaf; 256 struct extent_buffer *leaf;
218 int slot; 257 int slot;
219 u64 last; 258 u64 total_found = 0;
220
221 if (!block_group)
222 return 0;
223 259
224 root = root->fs_info->extent_root; 260 BUG_ON(!fs_info);
225
226 if (block_group->cached)
227 return 0;
228 261
229 path = btrfs_alloc_path(); 262 path = btrfs_alloc_path();
230 if (!path) 263 if (!path)
231 return -ENOMEM; 264 return -ENOMEM;
232 265
233 path->reada = 2; 266 atomic_inc(&block_group->space_info->caching_threads);
267 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
268again:
269 /* need to make sure the commit_root doesn't disappear */
270 down_read(&fs_info->extent_root->commit_root_sem);
271
234 /* 272 /*
235 * we get into deadlocks with paths held by callers of this function. 273 * We don't want to deadlock with somebody trying to allocate a new
236 * since the alloc_mutex is protecting things right now, just 274 * extent for the extent root while also trying to search the extent
237 * skip the locking here 275 * root to add free space. So we skip locking and search the commit
276 * root, since its read-only
238 */ 277 */
239 path->skip_locking = 1; 278 path->skip_locking = 1;
240 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 279 path->search_commit_root = 1;
280 path->reada = 2;
281
241 key.objectid = last; 282 key.objectid = last;
242 key.offset = 0; 283 key.offset = 0;
243 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 284 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
244 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 285 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
245 if (ret < 0) 286 if (ret < 0)
246 goto err; 287 goto err;
247 288
248 while (1) { 289 while (1) {
290 smp_mb();
291 if (block_group->fs_info->closing > 1) {
292 last = (u64)-1;
293 break;
294 }
295
249 leaf = path->nodes[0]; 296 leaf = path->nodes[0];
250 slot = path->slots[0]; 297 slot = path->slots[0];
251 if (slot >= btrfs_header_nritems(leaf)) { 298 if (slot >= btrfs_header_nritems(leaf)) {
252 ret = btrfs_next_leaf(root, path); 299 ret = btrfs_next_leaf(fs_info->extent_root, path);
253 if (ret < 0) 300 if (ret < 0)
254 goto err; 301 goto err;
255 if (ret == 0) 302 else if (ret)
256 continue;
257 else
258 break; 303 break;
304
305 if (need_resched()) {
306 btrfs_release_path(fs_info->extent_root, path);
307 up_read(&fs_info->extent_root->commit_root_sem);
308 cond_resched();
309 goto again;
310 }
311
312 continue;
259 } 313 }
260 btrfs_item_key_to_cpu(leaf, &key, slot); 314 btrfs_item_key_to_cpu(leaf, &key, slot);
261 if (key.objectid < block_group->key.objectid) 315 if (key.objectid < block_group->key.objectid)
@@ -266,24 +320,59 @@ static int cache_block_group(struct btrfs_root *root,
266 break; 320 break;
267 321
268 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { 322 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
269 add_new_free_space(block_group, root->fs_info, last, 323 total_found += add_new_free_space(block_group,
270 key.objectid); 324 fs_info, last,
271 325 key.objectid);
272 last = key.objectid + key.offset; 326 last = key.objectid + key.offset;
273 } 327 }
328
329 if (total_found > (1024 * 1024 * 2)) {
330 total_found = 0;
331 wake_up(&block_group->caching_q);
332 }
274next: 333next:
275 path->slots[0]++; 334 path->slots[0]++;
276 } 335 }
336 ret = 0;
277 337
278 add_new_free_space(block_group, root->fs_info, last, 338 total_found += add_new_free_space(block_group, fs_info, last,
279 block_group->key.objectid + 339 block_group->key.objectid +
280 block_group->key.offset); 340 block_group->key.offset);
341
342 spin_lock(&block_group->lock);
343 block_group->cached = BTRFS_CACHE_FINISHED;
344 spin_unlock(&block_group->lock);
281 345
282 block_group->cached = 1;
283 remove_sb_from_cache(root, block_group);
284 ret = 0;
285err: 346err:
286 btrfs_free_path(path); 347 btrfs_free_path(path);
348 up_read(&fs_info->extent_root->commit_root_sem);
349 atomic_dec(&block_group->space_info->caching_threads);
350 wake_up(&block_group->caching_q);
351
352 return 0;
353}
354
355static int cache_block_group(struct btrfs_block_group_cache *cache)
356{
357 struct task_struct *tsk;
358 int ret = 0;
359
360 spin_lock(&cache->lock);
361 if (cache->cached != BTRFS_CACHE_NO) {
362 spin_unlock(&cache->lock);
363 return ret;
364 }
365 cache->cached = BTRFS_CACHE_STARTED;
366 spin_unlock(&cache->lock);
367
368 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
369 cache->key.objectid);
370 if (IS_ERR(tsk)) {
371 ret = PTR_ERR(tsk);
372 printk(KERN_ERR "error running thread %d\n", ret);
373 BUG();
374 }
375
287 return ret; 376 return ret;
288} 377}
289 378
@@ -2387,13 +2476,29 @@ fail:
2387 2476
2388} 2477}
2389 2478
2479static struct btrfs_block_group_cache *
2480next_block_group(struct btrfs_root *root,
2481 struct btrfs_block_group_cache *cache)
2482{
2483 struct rb_node *node;
2484 spin_lock(&root->fs_info->block_group_cache_lock);
2485 node = rb_next(&cache->cache_node);
2486 btrfs_put_block_group(cache);
2487 if (node) {
2488 cache = rb_entry(node, struct btrfs_block_group_cache,
2489 cache_node);
2490 atomic_inc(&cache->count);
2491 } else
2492 cache = NULL;
2493 spin_unlock(&root->fs_info->block_group_cache_lock);
2494 return cache;
2495}
2496
2390int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2497int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2391 struct btrfs_root *root) 2498 struct btrfs_root *root)
2392{ 2499{
2393 struct btrfs_block_group_cache *cache, *entry; 2500 struct btrfs_block_group_cache *cache;
2394 struct rb_node *n;
2395 int err = 0; 2501 int err = 0;
2396 int werr = 0;
2397 struct btrfs_path *path; 2502 struct btrfs_path *path;
2398 u64 last = 0; 2503 u64 last = 0;
2399 2504
@@ -2402,39 +2507,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2402 return -ENOMEM; 2507 return -ENOMEM;
2403 2508
2404 while (1) { 2509 while (1) {
2405 cache = NULL; 2510 if (last == 0) {
2406 spin_lock(&root->fs_info->block_group_cache_lock); 2511 err = btrfs_run_delayed_refs(trans, root,
2407 for (n = rb_first(&root->fs_info->block_group_cache_tree); 2512 (unsigned long)-1);
2408 n; n = rb_next(n)) { 2513 BUG_ON(err);
2409 entry = rb_entry(n, struct btrfs_block_group_cache,
2410 cache_node);
2411 if (entry->dirty) {
2412 cache = entry;
2413 break;
2414 }
2415 } 2514 }
2416 spin_unlock(&root->fs_info->block_group_cache_lock);
2417 2515
2418 if (!cache) 2516 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2419 break; 2517 while (cache) {
2518 if (cache->dirty)
2519 break;
2520 cache = next_block_group(root, cache);
2521 }
2522 if (!cache) {
2523 if (last == 0)
2524 break;
2525 last = 0;
2526 continue;
2527 }
2420 2528
2421 cache->dirty = 0; 2529 cache->dirty = 0;
2422 last += cache->key.offset; 2530 last = cache->key.objectid + cache->key.offset;
2423 2531
2424 err = write_one_cache_group(trans, root, 2532 err = write_one_cache_group(trans, root, path, cache);
2425 path, cache); 2533 BUG_ON(err);
2426 /* 2534 btrfs_put_block_group(cache);
2427 * if we fail to write the cache group, we want
2428 * to keep it marked dirty in hopes that a later
2429 * write will work
2430 */
2431 if (err) {
2432 werr = err;
2433 continue;
2434 }
2435 } 2535 }
2536
2436 btrfs_free_path(path); 2537 btrfs_free_path(path);
2437 return werr; 2538 return 0;
2438} 2539}
2439 2540
2440int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 2541int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -2484,6 +2585,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2484 found->force_alloc = 0; 2585 found->force_alloc = 0;
2485 *space_info = found; 2586 *space_info = found;
2486 list_add_rcu(&found->list, &info->space_info); 2587 list_add_rcu(&found->list, &info->space_info);
2588 atomic_set(&found->caching_threads, 0);
2487 return 0; 2589 return 0;
2488} 2590}
2489 2591
@@ -2947,13 +3049,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2947 struct btrfs_block_group_cache *cache; 3049 struct btrfs_block_group_cache *cache;
2948 struct btrfs_fs_info *fs_info = root->fs_info; 3050 struct btrfs_fs_info *fs_info = root->fs_info;
2949 3051
2950 if (pin) { 3052 if (pin)
2951 set_extent_dirty(&fs_info->pinned_extents, 3053 set_extent_dirty(&fs_info->pinned_extents,
2952 bytenr, bytenr + num - 1, GFP_NOFS); 3054 bytenr, bytenr + num - 1, GFP_NOFS);
2953 } else {
2954 clear_extent_dirty(&fs_info->pinned_extents,
2955 bytenr, bytenr + num - 1, GFP_NOFS);
2956 }
2957 3055
2958 while (num > 0) { 3056 while (num > 0) {
2959 cache = btrfs_lookup_block_group(fs_info, bytenr); 3057 cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2969,14 +3067,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
2969 spin_unlock(&cache->space_info->lock); 3067 spin_unlock(&cache->space_info->lock);
2970 fs_info->total_pinned += len; 3068 fs_info->total_pinned += len;
2971 } else { 3069 } else {
3070 int unpin = 0;
3071
3072 /*
3073 * in order to not race with the block group caching, we
3074 * only want to unpin the extent if we are cached. If
3075 * we aren't cached, we want to start async caching this
3076 * block group so we can free the extent the next time
3077 * around.
3078 */
2972 spin_lock(&cache->space_info->lock); 3079 spin_lock(&cache->space_info->lock);
2973 spin_lock(&cache->lock); 3080 spin_lock(&cache->lock);
2974 cache->pinned -= len; 3081 unpin = (cache->cached == BTRFS_CACHE_FINISHED);
2975 cache->space_info->bytes_pinned -= len; 3082 if (likely(unpin)) {
3083 cache->pinned -= len;
3084 cache->space_info->bytes_pinned -= len;
3085 fs_info->total_pinned -= len;
3086 }
2976 spin_unlock(&cache->lock); 3087 spin_unlock(&cache->lock);
2977 spin_unlock(&cache->space_info->lock); 3088 spin_unlock(&cache->space_info->lock);
2978 fs_info->total_pinned -= len; 3089
2979 if (cache->cached) 3090 if (likely(unpin))
3091 clear_extent_dirty(&fs_info->pinned_extents,
3092 bytenr, bytenr + len -1,
3093 GFP_NOFS);
3094 else
3095 cache_block_group(cache);
3096
3097 if (unpin)
2980 btrfs_add_free_space(cache, bytenr, len); 3098 btrfs_add_free_space(cache, bytenr, len);
2981 } 3099 }
2982 btrfs_put_block_group(cache); 3100 btrfs_put_block_group(cache);
@@ -3030,6 +3148,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
3030 &start, &end, EXTENT_DIRTY); 3148 &start, &end, EXTENT_DIRTY);
3031 if (ret) 3149 if (ret)
3032 break; 3150 break;
3151
3033 set_extent_dirty(copy, start, end, GFP_NOFS); 3152 set_extent_dirty(copy, start, end, GFP_NOFS);
3034 last = end + 1; 3153 last = end + 1;
3035 } 3154 }
@@ -3058,6 +3177,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3058 3177
3059 cond_resched(); 3178 cond_resched();
3060 } 3179 }
3180
3061 return ret; 3181 return ret;
3062} 3182}
3063 3183
@@ -3436,6 +3556,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
3436} 3556}
3437 3557
3438/* 3558/*
3559 * when we wait for progress in the block group caching, its because
3560 * our allocation attempt failed at least once. So, we must sleep
3561 * and let some progress happen before we try again.
3562 *
3563 * This function will sleep at least once waiting for new free space to
3564 * show up, and then it will check the block group free space numbers
3565 * for our min num_bytes. Another option is to have it go ahead
3566 * and look in the rbtree for a free extent of a given size, but this
3567 * is a good start.
3568 */
3569static noinline int
3570wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
3571 u64 num_bytes)
3572{
3573 DEFINE_WAIT(wait);
3574
3575 prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
3576
3577 if (block_group_cache_done(cache)) {
3578 finish_wait(&cache->caching_q, &wait);
3579 return 0;
3580 }
3581 schedule();
3582 finish_wait(&cache->caching_q, &wait);
3583
3584 wait_event(cache->caching_q, block_group_cache_done(cache) ||
3585 (cache->free_space >= num_bytes));
3586 return 0;
3587}
3588
3589enum btrfs_loop_type {
3590 LOOP_CACHED_ONLY = 0,
3591 LOOP_CACHING_NOWAIT = 1,
3592 LOOP_CACHING_WAIT = 2,
3593 LOOP_ALLOC_CHUNK = 3,
3594 LOOP_NO_EMPTY_SIZE = 4,
3595};
3596
3597/*
3439 * walks the btree of allocated extents and find a hole of a given size. 3598 * walks the btree of allocated extents and find a hole of a given size.
3440 * The key ins is changed to record the hole: 3599 * The key ins is changed to record the hole:
3441 * ins->objectid == block start 3600 * ins->objectid == block start
@@ -3460,6 +3619,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3460 struct btrfs_space_info *space_info; 3619 struct btrfs_space_info *space_info;
3461 int last_ptr_loop = 0; 3620 int last_ptr_loop = 0;
3462 int loop = 0; 3621 int loop = 0;
3622 bool found_uncached_bg = false;
3463 3623
3464 WARN_ON(num_bytes < root->sectorsize); 3624 WARN_ON(num_bytes < root->sectorsize);
3465 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 3625 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
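
The find_free_extent() retry counter is no longer a bare 0..3; the LOOP_* values added above name each escalation step, from scanning only fully cached block groups up to retrying with empty_size and empty_cluster forced to zero. Stripped to its shape, the control flow looks like this (purely illustrative; the demo_ helpers are stand-ins):

	#include <linux/errno.h>

	enum demo_loop_type {
		DEMO_LOOP_CACHED_ONLY = 0,
		DEMO_LOOP_CACHING_NOWAIT,
		DEMO_LOOP_CACHING_WAIT,
		DEMO_LOOP_ALLOC_CHUNK,
		DEMO_LOOP_NO_EMPTY_SIZE,
	};

	/* Stand-in for one pass over the block groups at a given strictness. */
	static int demo_try_alloc(int loop)
	{
		return -1;			/* always "not found" in this sketch */
	}

	static int demo_allocate(void)
	{
		int loop;

		for (loop = DEMO_LOOP_CACHED_ONLY;
		     loop <= DEMO_LOOP_NO_EMPTY_SIZE; loop++) {
			if (demo_try_alloc(loop) == 0)
				return 0;
			/* LOOP_CACHING_WAIT would block on the caching thread,
			 * LOOP_ALLOC_CHUNK would force a chunk allocation,
			 * LOOP_NO_EMPTY_SIZE drops the cluster/empty_size slack. */
		}
		return -ENOSPC;
	}
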
@@ -3491,15 +3651,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3491 search_start = max(search_start, first_logical_byte(root, 0)); 3651 search_start = max(search_start, first_logical_byte(root, 0));
3492 search_start = max(search_start, hint_byte); 3652 search_start = max(search_start, hint_byte);
3493 3653
3494 if (!last_ptr) { 3654 if (!last_ptr)
3495 empty_cluster = 0; 3655 empty_cluster = 0;
3496 loop = 1;
3497 }
3498 3656
3499 if (search_start == hint_byte) { 3657 if (search_start == hint_byte) {
3500 block_group = btrfs_lookup_block_group(root->fs_info, 3658 block_group = btrfs_lookup_block_group(root->fs_info,
3501 search_start); 3659 search_start);
3502 if (block_group && block_group_bits(block_group, data)) { 3660 /*
3661 * we don't want to use the block group if it doesn't match our
3662 * allocation bits, or if its not cached.
3663 */
3664 if (block_group && block_group_bits(block_group, data) &&
3665 block_group_cache_done(block_group)) {
3503 down_read(&space_info->groups_sem); 3666 down_read(&space_info->groups_sem);
3504 if (list_empty(&block_group->list) || 3667 if (list_empty(&block_group->list) ||
3505 block_group->ro) { 3668 block_group->ro) {
@@ -3522,21 +3685,35 @@ search:
3522 down_read(&space_info->groups_sem); 3685 down_read(&space_info->groups_sem);
3523 list_for_each_entry(block_group, &space_info->block_groups, list) { 3686 list_for_each_entry(block_group, &space_info->block_groups, list) {
3524 u64 offset; 3687 u64 offset;
3688 int cached;
3525 3689
3526 atomic_inc(&block_group->count); 3690 atomic_inc(&block_group->count);
3527 search_start = block_group->key.objectid; 3691 search_start = block_group->key.objectid;
3528 3692
3529have_block_group: 3693have_block_group:
3530 if (unlikely(!block_group->cached)) { 3694 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
3531 mutex_lock(&block_group->cache_mutex); 3695 /*
3532 ret = cache_block_group(root, block_group); 3696 * we want to start caching kthreads, but not too many
3533 mutex_unlock(&block_group->cache_mutex); 3697 * right off the bat so we don't overwhelm the system,
3534 if (ret) { 3698 * so only start them if there are less than 2 and we're
3535 btrfs_put_block_group(block_group); 3699 * in the initial allocation phase.
3536 break; 3700 */
3701 if (loop > LOOP_CACHING_NOWAIT ||
3702 atomic_read(&space_info->caching_threads) < 2) {
3703 ret = cache_block_group(block_group);
3704 BUG_ON(ret);
3537 } 3705 }
3538 } 3706 }
3539 3707
3708 cached = block_group_cache_done(block_group);
3709 if (unlikely(!cached)) {
3710 found_uncached_bg = true;
3711
3712 /* if we only want cached bgs, loop */
3713 if (loop == LOOP_CACHED_ONLY)
3714 goto loop;
3715 }
3716
3540 if (unlikely(block_group->ro)) 3717 if (unlikely(block_group->ro))
3541 goto loop; 3718 goto loop;
3542 3719
@@ -3615,14 +3792,21 @@ refill_cluster:
3615 spin_unlock(&last_ptr->refill_lock); 3792 spin_unlock(&last_ptr->refill_lock);
3616 goto checks; 3793 goto checks;
3617 } 3794 }
3795 } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
3796 spin_unlock(&last_ptr->refill_lock);
3797
3798 wait_block_group_cache_progress(block_group,
3799 num_bytes + empty_cluster + empty_size);
3800 goto have_block_group;
3618 } 3801 }
3802
3619 /* 3803 /*
3620 * at this point we either didn't find a cluster 3804 * at this point we either didn't find a cluster
3621 * or we weren't able to allocate a block from our 3805 * or we weren't able to allocate a block from our
3622 * cluster. Free the cluster we've been trying 3806 * cluster. Free the cluster we've been trying
3623 * to use, and go to the next block group 3807 * to use, and go to the next block group
3624 */ 3808 */
3625 if (loop < 2) { 3809 if (loop < LOOP_NO_EMPTY_SIZE) {
3626 btrfs_return_cluster_to_free_space(NULL, 3810 btrfs_return_cluster_to_free_space(NULL,
3627 last_ptr); 3811 last_ptr);
3628 spin_unlock(&last_ptr->refill_lock); 3812 spin_unlock(&last_ptr->refill_lock);
@@ -3633,11 +3817,17 @@ refill_cluster:
3633 3817
3634 offset = btrfs_find_space_for_alloc(block_group, search_start, 3818 offset = btrfs_find_space_for_alloc(block_group, search_start,
3635 num_bytes, empty_size); 3819 num_bytes, empty_size);
3636 if (!offset) 3820 if (!offset && (cached || (!cached &&
3821 loop == LOOP_CACHING_NOWAIT))) {
3637 goto loop; 3822 goto loop;
3823 } else if (!offset && (!cached &&
3824 loop > LOOP_CACHING_NOWAIT)) {
3825 wait_block_group_cache_progress(block_group,
3826 num_bytes + empty_size);
3827 goto have_block_group;
3828 }
3638checks: 3829checks:
3639 search_start = stripe_align(root, offset); 3830 search_start = stripe_align(root, offset);
3640
3641 /* move on to the next group */ 3831 /* move on to the next group */
3642 if (search_start + num_bytes >= search_end) { 3832 if (search_start + num_bytes >= search_end) {
3643 btrfs_add_free_space(block_group, offset, num_bytes); 3833 btrfs_add_free_space(block_group, offset, num_bytes);
@@ -3683,13 +3873,26 @@ loop:
3683 } 3873 }
3684 up_read(&space_info->groups_sem); 3874 up_read(&space_info->groups_sem);
3685 3875
3686 /* loop == 0, try to find a clustered alloc in every block group 3876 /* LOOP_CACHED_ONLY, only search fully cached block groups
3687 * loop == 1, try again after forcing a chunk allocation 3877 * LOOP_CACHING_NOWAIT, search partially cached block groups, but
3688 * loop == 2, set empty_size and empty_cluster to 0 and try again 3878 * don't wait for them to finish caching
3879 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3880 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3881 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
3882 * again
3689 */ 3883 */
3690 if (!ins->objectid && loop < 3 && 3884 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
3691 (empty_size || empty_cluster || allowed_chunk_alloc)) { 3885 (found_uncached_bg || empty_size || empty_cluster ||
3692 if (loop >= 2) { 3886 allowed_chunk_alloc)) {
3887 if (found_uncached_bg) {
3888 found_uncached_bg = false;
3889 if (loop < LOOP_CACHING_WAIT) {
3890 loop++;
3891 goto search;
3892 }
3893 }
3894
3895 if (loop == LOOP_ALLOC_CHUNK) {
3693 empty_size = 0; 3896 empty_size = 0;
3694 empty_cluster = 0; 3897 empty_cluster = 0;
3695 } 3898 }
@@ -3702,7 +3905,7 @@ loop:
3702 space_info->force_alloc = 1; 3905 space_info->force_alloc = 1;
3703 } 3906 }
3704 3907
3705 if (loop < 3) { 3908 if (loop < LOOP_NO_EMPTY_SIZE) {
3706 loop++; 3909 loop++;
3707 goto search; 3910 goto search;
3708 } 3911 }
@@ -3798,7 +4001,7 @@ again:
3798 num_bytes, data, 1); 4001 num_bytes, data, 1);
3799 goto again; 4002 goto again;
3800 } 4003 }
3801 if (ret) { 4004 if (ret == -ENOSPC) {
3802 struct btrfs_space_info *sinfo; 4005 struct btrfs_space_info *sinfo;
3803 4006
3804 sinfo = __find_space_info(root->fs_info, data); 4007 sinfo = __find_space_info(root->fs_info, data);
@@ -3806,7 +4009,6 @@ again:
3806 "wanted %llu\n", (unsigned long long)data, 4009 "wanted %llu\n", (unsigned long long)data,
3807 (unsigned long long)num_bytes); 4010 (unsigned long long)num_bytes);
3808 dump_space_info(sinfo, num_bytes); 4011 dump_space_info(sinfo, num_bytes);
3809 BUG();
3810 } 4012 }
3811 4013
3812 return ret; 4014 return ret;
@@ -3844,7 +4046,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3844 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, 4046 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3845 empty_size, hint_byte, search_end, ins, 4047 empty_size, hint_byte, search_end, ins,
3846 data); 4048 data);
3847 update_reserved_extents(root, ins->objectid, ins->offset, 1); 4049 if (!ret)
4050 update_reserved_extents(root, ins->objectid, ins->offset, 1);
4051
3848 return ret; 4052 return ret;
3849} 4053}
3850 4054
@@ -4006,9 +4210,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4006 struct btrfs_block_group_cache *block_group; 4210 struct btrfs_block_group_cache *block_group;
4007 4211
4008 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 4212 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
4009 mutex_lock(&block_group->cache_mutex); 4213 cache_block_group(block_group);
4010 cache_block_group(root, block_group); 4214 wait_event(block_group->caching_q,
4011 mutex_unlock(&block_group->cache_mutex); 4215 block_group_cache_done(block_group));
4012 4216
4013 ret = btrfs_remove_free_space(block_group, ins->objectid, 4217 ret = btrfs_remove_free_space(block_group, ins->objectid,
4014 ins->offset); 4218 ins->offset);
@@ -4039,7 +4243,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
4039 ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, 4243 ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4040 empty_size, hint_byte, search_end, 4244 empty_size, hint_byte, search_end,
4041 ins, 0); 4245 ins, 0);
4042 BUG_ON(ret); 4246 if (ret)
4247 return ret;
4043 4248
4044 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 4249 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4045 if (parent == 0) 4250 if (parent == 0)
@@ -6955,11 +7160,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6955 &info->block_group_cache_tree); 7160 &info->block_group_cache_tree);
6956 spin_unlock(&info->block_group_cache_lock); 7161 spin_unlock(&info->block_group_cache_lock);
6957 7162
6958 btrfs_remove_free_space_cache(block_group);
6959 down_write(&block_group->space_info->groups_sem); 7163 down_write(&block_group->space_info->groups_sem);
6960 list_del(&block_group->list); 7164 list_del(&block_group->list);
6961 up_write(&block_group->space_info->groups_sem); 7165 up_write(&block_group->space_info->groups_sem);
6962 7166
7167 if (block_group->cached == BTRFS_CACHE_STARTED)
7168 wait_event(block_group->caching_q,
7169 block_group_cache_done(block_group));
7170
7171 btrfs_remove_free_space_cache(block_group);
7172
6963 WARN_ON(atomic_read(&block_group->count) != 1); 7173 WARN_ON(atomic_read(&block_group->count) != 1);
6964 kfree(block_group); 7174 kfree(block_group);
6965 7175
@@ -7025,9 +7235,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7025 atomic_set(&cache->count, 1); 7235 atomic_set(&cache->count, 1);
7026 spin_lock_init(&cache->lock); 7236 spin_lock_init(&cache->lock);
7027 spin_lock_init(&cache->tree_lock); 7237 spin_lock_init(&cache->tree_lock);
7028 mutex_init(&cache->cache_mutex); 7238 cache->fs_info = info;
7239 init_waitqueue_head(&cache->caching_q);
7029 INIT_LIST_HEAD(&cache->list); 7240 INIT_LIST_HEAD(&cache->list);
7030 INIT_LIST_HEAD(&cache->cluster_list); 7241 INIT_LIST_HEAD(&cache->cluster_list);
7242
7243 /*
7244 * we only want to have 32k of ram per block group for keeping
7245 * track of free space, and if we pass 1/2 of that we want to
7246 * start converting things over to using bitmaps
7247 */
7248 cache->extents_thresh = ((1024 * 32) / 2) /
7249 sizeof(struct btrfs_free_space);
7250
7031 read_extent_buffer(leaf, &cache->item, 7251 read_extent_buffer(leaf, &cache->item,
7032 btrfs_item_ptr_offset(leaf, path->slots[0]), 7252 btrfs_item_ptr_offset(leaf, path->slots[0]),
7033 sizeof(cache->item)); 7253 sizeof(cache->item));
@@ -7036,6 +7256,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7036 key.objectid = found_key.objectid + found_key.offset; 7256 key.objectid = found_key.objectid + found_key.offset;
7037 btrfs_release_path(root, path); 7257 btrfs_release_path(root, path);
7038 cache->flags = btrfs_block_group_flags(&cache->item); 7258 cache->flags = btrfs_block_group_flags(&cache->item);
7259 cache->sectorsize = root->sectorsize;
7260
7261 remove_sb_from_cache(root, cache);
7262
7263 /*
7264 * check for two cases, either we are full, and therefore
7265 * don't need to bother with the caching work since we won't
7266 * find any space, or we are empty, and we can just add all
7267 * the space in and be done with it. This saves us _alot_ of
7268 * time, particularly in the full case.
7269 */
7270 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7271 cache->cached = BTRFS_CACHE_FINISHED;
7272 } else if (btrfs_block_group_used(&cache->item) == 0) {
7273 cache->cached = BTRFS_CACHE_FINISHED;
7274 add_new_free_space(cache, root->fs_info,
7275 found_key.objectid,
7276 found_key.objectid +
7277 found_key.offset);
7278 }
7039 7279
7040 ret = update_space_info(info, cache->flags, found_key.offset, 7280 ret = update_space_info(info, cache->flags, found_key.offset,
7041 btrfs_block_group_used(&cache->item), 7281 btrfs_block_group_used(&cache->item),
@@ -7079,10 +7319,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7079 cache->key.objectid = chunk_offset; 7319 cache->key.objectid = chunk_offset;
7080 cache->key.offset = size; 7320 cache->key.offset = size;
7081 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7321 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7322 cache->sectorsize = root->sectorsize;
7323
7324 /*
7325 * we only want to have 32k of ram per block group for keeping track
7326 * of free space, and if we pass 1/2 of that we want to start
7327 * converting things over to using bitmaps
7328 */
7329 cache->extents_thresh = ((1024 * 32) / 2) /
7330 sizeof(struct btrfs_free_space);
7082 atomic_set(&cache->count, 1); 7331 atomic_set(&cache->count, 1);
7083 spin_lock_init(&cache->lock); 7332 spin_lock_init(&cache->lock);
7084 spin_lock_init(&cache->tree_lock); 7333 spin_lock_init(&cache->tree_lock);
7085 mutex_init(&cache->cache_mutex); 7334 init_waitqueue_head(&cache->caching_q);
7086 INIT_LIST_HEAD(&cache->list); 7335 INIT_LIST_HEAD(&cache->list);
7087 INIT_LIST_HEAD(&cache->cluster_list); 7336 INIT_LIST_HEAD(&cache->cluster_list);
7088 7337
@@ -7091,6 +7340,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7091 cache->flags = type; 7340 cache->flags = type;
7092 btrfs_set_block_group_flags(&cache->item, type); 7341 btrfs_set_block_group_flags(&cache->item, type);
7093 7342
7343 cache->cached = BTRFS_CACHE_FINISHED;
7344 remove_sb_from_cache(root, cache);
7345
7346 add_new_free_space(cache, root->fs_info, chunk_offset,
7347 chunk_offset + size);
7348
7094 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7349 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7095 &cache->space_info); 7350 &cache->space_info);
7096 BUG_ON(ret); 7351 BUG_ON(ret);
@@ -7149,7 +7404,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7149 rb_erase(&block_group->cache_node, 7404 rb_erase(&block_group->cache_node,
7150 &root->fs_info->block_group_cache_tree); 7405 &root->fs_info->block_group_cache_tree);
7151 spin_unlock(&root->fs_info->block_group_cache_lock); 7406 spin_unlock(&root->fs_info->block_group_cache_lock);
7152 btrfs_remove_free_space_cache(block_group); 7407
7153 down_write(&block_group->space_info->groups_sem); 7408 down_write(&block_group->space_info->groups_sem);
7154 /* 7409 /*
7155 * we must use list_del_init so people can check to see if they 7410 * we must use list_del_init so people can check to see if they
@@ -7158,11 +7413,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7158 list_del_init(&block_group->list); 7413 list_del_init(&block_group->list);
7159 up_write(&block_group->space_info->groups_sem); 7414 up_write(&block_group->space_info->groups_sem);
7160 7415
7416 if (block_group->cached == BTRFS_CACHE_STARTED)
7417 wait_event(block_group->caching_q,
7418 block_group_cache_done(block_group));
7419
7420 btrfs_remove_free_space_cache(block_group);
7421
7161 spin_lock(&block_group->space_info->lock); 7422 spin_lock(&block_group->space_info->lock);
7162 block_group->space_info->total_bytes -= block_group->key.offset; 7423 block_group->space_info->total_bytes -= block_group->key.offset;
7163 block_group->space_info->bytes_readonly -= block_group->key.offset; 7424 block_group->space_info->bytes_readonly -= block_group->key.offset;
7164 spin_unlock(&block_group->space_info->lock); 7425 spin_unlock(&block_group->space_info->lock);
7165 block_group->space_info->full = 0; 7426
7427 btrfs_clear_space_info_full(root->fs_info);
7166 7428
7167 btrfs_put_block_group(block_group); 7429 btrfs_put_block_group(block_group);
7168 btrfs_put_block_group(block_group); 7430 btrfs_put_block_group(block_group);
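
Taken together, the extent-tree.c changes move block group caching into a kernel thread: cache_block_group() starts a "btrfs-cache-<objectid>" kthread, the thread publishes progress through caching_q, and allocators either skip uncached groups or wait on that queue. The bare synchronisation skeleton is roughly the following (hypothetical names, heavily simplified):

	#include <linux/kthread.h>
	#include <linux/wait.h>
	#include <linux/err.h>
	#include <linux/types.h>

	struct demo_group {
		wait_queue_head_t caching_q;
		int cached;			/* 0 = caching, 1 = finished */
		u64 free_space;
	};

	/* Stand-in for one scan pass over the extent tree; returns nonzero
	 * once the whole block group has been processed. */
	static int demo_scan_some(struct demo_group *g)
	{
		g->free_space += 4096;
		return g->free_space >= (1024 * 1024);
	}

	/* Background side: publish progress and wake waiters periodically,
	 * then once more when finished, as caching_kthread() does above. */
	static int demo_caching_thread(void *data)
	{
		struct demo_group *g = data;

		while (!demo_scan_some(g))
			wake_up(&g->caching_q);

		g->cached = 1;
		smp_mb();
		wake_up(&g->caching_q);
		return 0;
	}

	/* Allocator side: sleep until caching is done or enough free space
	 * has shown up, like wait_block_group_cache_progress() above. */
	static void demo_wait_for_space(struct demo_group *g, u64 bytes)
	{
		wait_event(g->caching_q, g->cached || g->free_space >= bytes);
	}

	static int demo_start(struct demo_group *g)
	{
		struct task_struct *tsk;

		init_waitqueue_head(&g->caching_q);
		tsk = kthread_run(demo_caching_thread, g, "demo-cache");
		return IS_ERR(tsk) ? PTR_ERR(tsk) : 0;
	}
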
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4538e48581a5..af99b78b288e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -16,45 +16,46 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/pagemap.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/math64.h>
20#include "ctree.h" 22#include "ctree.h"
21#include "free-space-cache.h" 23#include "free-space-cache.h"
22#include "transaction.h" 24#include "transaction.h"
23 25
24struct btrfs_free_space { 26#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
25 struct rb_node bytes_index; 27#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
30 28
31static int tree_insert_offset(struct rb_root *root, u64 offset, 29static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
32 struct rb_node *node) 30 u64 offset)
33{ 31{
34 struct rb_node **p = &root->rb_node; 32 BUG_ON(offset < bitmap_start);
35 struct rb_node *parent = NULL; 33 offset -= bitmap_start;
36 struct btrfs_free_space *info; 34 return (unsigned long)(div64_u64(offset, sectorsize));
35}
37 36
38 while (*p) { 37static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
39 parent = *p; 38{
40 info = rb_entry(parent, struct btrfs_free_space, offset_index); 39 return (unsigned long)(div64_u64(bytes, sectorsize));
40}
41 41
42 if (offset < info->offset) 42static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
43 p = &(*p)->rb_left; 43 u64 offset)
44 else if (offset > info->offset) 44{
45 p = &(*p)->rb_right; 45 u64 bitmap_start;
46 else 46 u64 bytes_per_bitmap;
47 return -EEXIST;
48 }
49 47
50 rb_link_node(node, parent, p); 48 bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
51 rb_insert_color(node, root); 49 bitmap_start = offset - block_group->key.objectid;
50 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
51 bitmap_start *= bytes_per_bitmap;
52 bitmap_start += block_group->key.objectid;
52 53
53 return 0; 54 return bitmap_start;
54} 55}
55 56
56static int tree_insert_bytes(struct rb_root *root, u64 bytes, 57static int tree_insert_offset(struct rb_root *root, u64 offset,
57 struct rb_node *node) 58 struct rb_node *node, int bitmap)
58{ 59{
59 struct rb_node **p = &root->rb_node; 60 struct rb_node **p = &root->rb_node;
60 struct rb_node *parent = NULL; 61 struct rb_node *parent = NULL;
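The helpers added in the hunk above (offset_to_bit(), bytes_to_bits(), offset_to_bitmap()) are plain integer arithmetic: each bitmap covers BITS_PER_BITMAP * sectorsize bytes, and a byte offset maps to the bitmap window it falls in plus a bit index within it. A standalone sketch of that arithmetic, assuming a 4 KiB page size and a 4 KiB sectorsize (both assumptions for illustration, not values taken from the patch):

#include <stdio.h>
#include <stdint.h>

#define PAGE_BYTES	4096ULL			/* assumed page size */
#define BITS_PER_BITMAP	(PAGE_BYTES * 8)	/* 32768 bits per bitmap page */
#define SECTORSIZE	4096ULL			/* assumed block group sectorsize */

/* bit index of 'offset' within the bitmap that starts at 'bitmap_start' */
static unsigned long offset_to_bit(uint64_t bitmap_start, uint64_t offset)
{
	return (unsigned long)((offset - bitmap_start) / SECTORSIZE);
}

/* start of the bitmap window covering 'offset' in a block group at 'bg_start' */
static uint64_t offset_to_bitmap(uint64_t bg_start, uint64_t offset)
{
	uint64_t bytes_per_bitmap = BITS_PER_BITMAP * SECTORSIZE;	/* 128 MiB here */
	uint64_t rel = offset - bg_start;

	return bg_start + (rel / bytes_per_bitmap) * bytes_per_bitmap;
}

int main(void)
{
	uint64_t bg_start = 1024ULL * 1024 * 1024;		/* block group at 1 GiB */
	uint64_t offset = bg_start + 200ULL * 1024 * 1024 + 8192;
	uint64_t bmap = offset_to_bitmap(bg_start, offset);

	printf("bitmap window starts at %llu, offset is bit %lu\n",
	       (unsigned long long)bmap, offset_to_bit(bmap, offset));
	return 0;
}

With those sizes one bitmap page covers 128 MiB of the block group.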
@@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
62 63
63 while (*p) { 64 while (*p) {
64 parent = *p; 65 parent = *p;
65 info = rb_entry(parent, struct btrfs_free_space, bytes_index); 66 info = rb_entry(parent, struct btrfs_free_space, offset_index);
66 67
67 if (bytes < info->bytes) 68 if (offset < info->offset) {
68 p = &(*p)->rb_left; 69 p = &(*p)->rb_left;
69 else 70 } else if (offset > info->offset) {
70 p = &(*p)->rb_right; 71 p = &(*p)->rb_right;
72 } else {
73 /*
74 * we could have a bitmap entry and an extent entry
75 * share the same offset. If this is the case, we want
76 * the extent entry to always be found first if we do a
77 * linear search through the tree, since we want to have
78 * the quickest allocation time, and allocating from an
79 * extent is faster than allocating from a bitmap. So
80 * if we're inserting a bitmap and we find an entry at
81 * this offset, we want to go right, or after this entry
82 * logically. If we are inserting an extent and we've
83 * found a bitmap, we want to go left, or before
84 * logically.
85 */
86 if (bitmap) {
87 WARN_ON(info->bitmap);
88 p = &(*p)->rb_right;
89 } else {
90 WARN_ON(!info->bitmap);
91 p = &(*p)->rb_left;
92 }
93 }
71 } 94 }
72 95
73 rb_link_node(node, parent, p); 96 rb_link_node(node, parent, p);
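The long comment in the hunk above spells out the ordering rule: an extent entry and a bitmap entry may share an offset, and the extent entry must sort first so a linear walk reaches the cheaper allocation before the bitmap. The same tie-break expressed as a plain comparator over an invented flat structure (not the rb-tree code), just to show the ordering:

#include <stdio.h>
#include <stdlib.h>

struct fs_entry {
	unsigned long long offset;
	int is_bitmap;		/* 0 = extent entry, 1 = bitmap entry */
};

/* extent entries sort before bitmap entries that share the same offset */
static int entry_cmp(const void *pa, const void *pb)
{
	const struct fs_entry *a = pa, *b = pb;

	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return a->is_bitmap - b->is_bitmap;
}

int main(void)
{
	struct fs_entry e[] = {
		{ 4096, 1 },	/* bitmap entry at 4096 */
		{ 4096, 0 },	/* extent entry at 4096 */
		{ 0, 0 },
	};
	size_t i;

	qsort(e, 3, sizeof(e[0]), entry_cmp);
	for (i = 0; i < 3; i++)
		printf("%llu %s\n", e[i].offset,
		       e[i].is_bitmap ? "bitmap" : "extent");
	return 0;
}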
@@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
79/* 102/*
80 * searches the tree for the given offset. 103 * searches the tree for the given offset.
81 * 104 *
82 * fuzzy == 1: this is used for allocations where we are given a hint of where 105 * fuzzy - If this is set, then we are trying to make an allocation, and we just
83 * to look for free space. Because the hint may not be completely on an offset 106 * want a section that has at least bytes size and comes at or after the given
84 * mark, or the hint may no longer point to free space we need to fudge our 107 * offset.
85 * results a bit. So we look for free space starting at or after offset with at
86 * least bytes size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
92 * starts at the given offset which is at least bytes size, and if its not there
93 * return NULL.
94 */ 108 */
95static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 109static struct btrfs_free_space *
96 u64 offset, u64 bytes, 110tree_search_offset(struct btrfs_block_group_cache *block_group,
97 int fuzzy) 111 u64 offset, int bitmap_only, int fuzzy)
98{ 112{
99 struct rb_node *n = root->rb_node; 113 struct rb_node *n = block_group->free_space_offset.rb_node;
100 struct btrfs_free_space *entry, *ret = NULL; 114 struct btrfs_free_space *entry, *prev = NULL;
115
116 /* find entry that is closest to the 'offset' */
117 while (1) {
118 if (!n) {
119 entry = NULL;
120 break;
121 }
101 122
102 while (n) {
103 entry = rb_entry(n, struct btrfs_free_space, offset_index); 123 entry = rb_entry(n, struct btrfs_free_space, offset_index);
124 prev = entry;
104 125
105 if (offset < entry->offset) { 126 if (offset < entry->offset)
106 if (fuzzy &&
107 (!ret || entry->offset < ret->offset) &&
108 (bytes <= entry->bytes))
109 ret = entry;
110 n = n->rb_left; 127 n = n->rb_left;
111 } else if (offset > entry->offset) { 128 else if (offset > entry->offset)
112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
114 bytes <= entry->bytes) {
115 ret = entry;
116 break;
117 }
118 n = n->rb_right; 129 n = n->rb_right;
119 } else { 130 else
120 if (bytes > entry->bytes) {
121 n = n->rb_right;
122 continue;
123 }
124 ret = entry;
125 break; 131 break;
126 }
127 } 132 }
128 133
129 return ret; 134 if (bitmap_only) {
130} 135 if (!entry)
131 136 return NULL;
132/* 137 if (entry->bitmap)
133 * return a chunk at least bytes size, as close to offset that we can get. 138 return entry;
134 */
135static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
136 u64 offset, u64 bytes)
137{
138 struct rb_node *n = root->rb_node;
139 struct btrfs_free_space *entry, *ret = NULL;
140 139
141 while (n) { 140 /*
142 entry = rb_entry(n, struct btrfs_free_space, bytes_index); 141 * bitmap entry and extent entry may share same offset,
142 * in that case, bitmap entry comes after extent entry.
143 */
144 n = rb_next(n);
145 if (!n)
146 return NULL;
147 entry = rb_entry(n, struct btrfs_free_space, offset_index);
148 if (entry->offset != offset)
149 return NULL;
143 150
144 if (bytes < entry->bytes) { 151 WARN_ON(!entry->bitmap);
152 return entry;
153 } else if (entry) {
154 if (entry->bitmap) {
145 /* 155 /*
146 * We prefer to get a hole size as close to the size we 156 * if previous extent entry covers the offset,
147 * are asking for so we don't take small slivers out of 157 * we should return it instead of the bitmap entry
148 * huge holes, but we also want to get as close to the
149 * offset as possible so we don't have a whole lot of
150 * fragmentation.
151 */ 158 */
152 if (offset <= entry->offset) { 159 n = &entry->offset_index;
153 if (!ret) 160 while (1) {
154 ret = entry; 161 n = rb_prev(n);
155 else if (entry->bytes < ret->bytes) 162 if (!n)
156 ret = entry; 163 break;
157 else if (entry->offset < ret->offset) 164 prev = rb_entry(n, struct btrfs_free_space,
158 ret = entry; 165 offset_index);
166 if (!prev->bitmap) {
167 if (prev->offset + prev->bytes > offset)
168 entry = prev;
169 break;
170 }
159 } 171 }
160 n = n->rb_left; 172 }
161 } else if (bytes > entry->bytes) { 173 return entry;
162 n = n->rb_right; 174 }
175
176 if (!prev)
177 return NULL;
178
179 /* find last entry before the 'offset' */
180 entry = prev;
181 if (entry->offset > offset) {
182 n = rb_prev(&entry->offset_index);
183 if (n) {
184 entry = rb_entry(n, struct btrfs_free_space,
185 offset_index);
186 BUG_ON(entry->offset > offset);
163 } else { 187 } else {
164 /* 188 if (fuzzy)
165 * Ok we may have multiple chunks of the wanted size, 189 return entry;
166 * so we don't want to take the first one we find, we 190 else
167 * want to take the one closest to our given offset, so 191 return NULL;
168 * keep searching just in case theres a better match.
169 */
170 n = n->rb_right;
171 if (offset > entry->offset)
172 continue;
173 else if (!ret || entry->offset < ret->offset)
174 ret = entry;
175 } 192 }
176 } 193 }
177 194
178 return ret; 195 if (entry->bitmap) {
196 n = &entry->offset_index;
197 while (1) {
198 n = rb_prev(n);
199 if (!n)
200 break;
201 prev = rb_entry(n, struct btrfs_free_space,
202 offset_index);
203 if (!prev->bitmap) {
204 if (prev->offset + prev->bytes > offset)
205 return prev;
206 break;
207 }
208 }
209 if (entry->offset + BITS_PER_BITMAP *
210 block_group->sectorsize > offset)
211 return entry;
212 } else if (entry->offset + entry->bytes > offset)
213 return entry;
214
215 if (!fuzzy)
216 return NULL;
217
218 while (1) {
219 if (entry->bitmap) {
220 if (entry->offset + BITS_PER_BITMAP *
221 block_group->sectorsize > offset)
222 break;
223 } else {
224 if (entry->offset + entry->bytes > offset)
225 break;
226 }
227
228 n = rb_next(&entry->offset_index);
229 if (!n)
230 return NULL;
231 entry = rb_entry(n, struct btrfs_free_space, offset_index);
232 }
233 return entry;
179} 234}
180 235
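The rewritten tree_search_offset() above repeatedly asks whether an entry still reaches past the requested offset, where a bitmap entry spans a fixed BITS_PER_BITMAP * sectorsize window and an extent entry spans exactly its byte count. A small sketch of that predicate, with the struct and the 4 KiB sizes invented for illustration:

#include <stdio.h>
#include <stdint.h>

#define PAGE_BYTES	4096ULL
#define BITS_PER_BITMAP	(PAGE_BYTES * 8)
#define SECTORSIZE	4096ULL			/* assumed */

struct fs_entry {
	uint64_t offset;
	uint64_t bytes;		/* meaningful for extent entries only */
	int is_bitmap;
};

/* does this entry still reach past 'offset'?  A bitmap entry spans a fixed
 * BITS_PER_BITMAP * sectorsize window, an extent entry spans its byte count */
static int entry_covers(const struct fs_entry *e, uint64_t offset)
{
	uint64_t end = e->is_bitmap ?
		e->offset + BITS_PER_BITMAP * SECTORSIZE :
		e->offset + e->bytes;

	return e->offset <= offset && offset < end;
}

int main(void)
{
	struct fs_entry bitmap = { 0, 0, 1 };	/* covers 0 .. 128 MiB */
	struct fs_entry extent = { 0, 8192, 0 };

	printf("%d %d\n", entry_covers(&bitmap, 1 << 20),
	       entry_covers(&extent, 1 << 20));
	return 0;
}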
181static void unlink_free_space(struct btrfs_block_group_cache *block_group, 236static void unlink_free_space(struct btrfs_block_group_cache *block_group,
182 struct btrfs_free_space *info) 237 struct btrfs_free_space *info)
183{ 238{
184 rb_erase(&info->offset_index, &block_group->free_space_offset); 239 rb_erase(&info->offset_index, &block_group->free_space_offset);
185 rb_erase(&info->bytes_index, &block_group->free_space_bytes); 240 block_group->free_extents--;
241 block_group->free_space -= info->bytes;
186} 242}
187 243
188static int link_free_space(struct btrfs_block_group_cache *block_group, 244static int link_free_space(struct btrfs_block_group_cache *block_group,
@@ -190,17 +246,314 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
190{ 246{
191 int ret = 0; 247 int ret = 0;
192 248
193 249 BUG_ON(!info->bitmap && !info->bytes);
194 BUG_ON(!info->bytes);
195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 250 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
196 &info->offset_index); 251 &info->offset_index, (info->bitmap != NULL));
197 if (ret) 252 if (ret)
198 return ret; 253 return ret;
199 254
200 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, 255 block_group->free_space += info->bytes;
201 &info->bytes_index); 256 block_group->free_extents++;
202 if (ret) 257 return ret;
203 return ret; 258}
259
260static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
261{
262 u64 max_bytes, possible_bytes;
263
264 /*
265 * The goal is to keep the total amount of memory used per 1gb of space
266 * at or below 32k, so we need to adjust how much memory we allow to be
267 * used by extent based free space tracking
268 */
269 max_bytes = MAX_CACHE_BYTES_PER_GIG *
270 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
271
272 possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
273 (sizeof(struct btrfs_free_space) *
274 block_group->extents_thresh);
275
276 if (possible_bytes > max_bytes) {
277 int extent_bytes = max_bytes -
278 (block_group->total_bitmaps * PAGE_CACHE_SIZE);
279
280 if (extent_bytes <= 0) {
281 block_group->extents_thresh = 0;
282 return;
283 }
284
285 block_group->extents_thresh = extent_bytes /
286 (sizeof(struct btrfs_free_space));
287 }
288}
289
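recalculate_thresholds() above keeps the cache to roughly MAX_CACHE_BYTES_PER_GIG (32 KiB) of memory per GiB of block group: bitmap pages are charged first, and whatever budget remains is divided among extent entries. A simplified standalone version of that arithmetic; ENTRY_SIZE stands in for sizeof(struct btrfs_free_space) and is an assumption:

#include <stdio.h>
#include <stdint.h>

#define PAGE_BYTES		4096ULL
#define MAX_CACHE_BYTES_PER_GIG	(32ULL * 1024)	/* 32 KiB of cache per GiB */
#define ENTRY_SIZE		64ULL		/* assumed entry size */

/* extent entries that still fit once 'bitmaps' bitmap pages are charged */
static uint64_t extents_thresh(uint64_t bg_bytes, uint64_t bitmaps)
{
	uint64_t max_bytes = MAX_CACHE_BYTES_PER_GIG *
			     (bg_bytes / (1024ULL * 1024 * 1024));
	uint64_t bitmap_bytes = bitmaps * PAGE_BYTES;

	if (bitmap_bytes >= max_bytes)
		return 0;
	return (max_bytes - bitmap_bytes) / ENTRY_SIZE;
}

int main(void)
{
	/* a 1 GiB block group that already carries two bitmap pages */
	printf("%llu extent entries allowed\n",
	       (unsigned long long)extents_thresh(1024ULL * 1024 * 1024, 2));
	return 0;
}

For a 1 GiB block group with two bitmap pages this leaves room for 384 extent entries under the assumed sizes.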
290static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
291 struct btrfs_free_space *info, u64 offset,
292 u64 bytes)
293{
294 unsigned long start, end;
295 unsigned long i;
296
297 start = offset_to_bit(info->offset, block_group->sectorsize, offset);
298 end = start + bytes_to_bits(bytes, block_group->sectorsize);
299 BUG_ON(end > BITS_PER_BITMAP);
300
301 for (i = start; i < end; i++)
302 clear_bit(i, info->bitmap);
303
304 info->bytes -= bytes;
305 block_group->free_space -= bytes;
306}
307
308static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
309 struct btrfs_free_space *info, u64 offset,
310 u64 bytes)
311{
312 unsigned long start, end;
313 unsigned long i;
314
315 start = offset_to_bit(info->offset, block_group->sectorsize, offset);
316 end = start + bytes_to_bits(bytes, block_group->sectorsize);
317 BUG_ON(end > BITS_PER_BITMAP);
318
319 for (i = start; i < end; i++)
320 set_bit(i, info->bitmap);
321
322 info->bytes += bytes;
323 block_group->free_space += bytes;
324}
325
326static int search_bitmap(struct btrfs_block_group_cache *block_group,
327 struct btrfs_free_space *bitmap_info, u64 *offset,
328 u64 *bytes)
329{
330 unsigned long found_bits = 0;
331 unsigned long bits, i;
332 unsigned long next_zero;
333
334 i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
335 max_t(u64, *offset, bitmap_info->offset));
336 bits = bytes_to_bits(*bytes, block_group->sectorsize);
337
338 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
339 i < BITS_PER_BITMAP;
340 i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
341 next_zero = find_next_zero_bit(bitmap_info->bitmap,
342 BITS_PER_BITMAP, i);
343 if ((next_zero - i) >= bits) {
344 found_bits = next_zero - i;
345 break;
346 }
347 i = next_zero;
348 }
349
350 if (found_bits) {
351 *offset = (u64)(i * block_group->sectorsize) +
352 bitmap_info->offset;
353 *bytes = (u64)(found_bits) * block_group->sectorsize;
354 return 0;
355 }
356
357 return -1;
358}
359
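search_bitmap() above walks the bitmap with find_next_bit()/find_next_zero_bit() looking for a run of set bits at least as long as the request. The same scan written against a plain byte-array bitmap so it can be compiled and poked at outside the kernel; the bitmap contents are made up:

#include <stdio.h>

#define NBITS 32

static int test_bit(const unsigned char *map, unsigned long i)
{
	return (map[i / 8] >> (i % 8)) & 1;
}

/* first run of at least 'need' consecutive set bits at or after 'start';
 * returns the start bit or -1 and reports the run length via *len */
static long find_run(const unsigned char *map, unsigned long start,
		     unsigned long need, unsigned long *len)
{
	unsigned long i = start;

	while (i < NBITS) {
		unsigned long j;

		while (i < NBITS && !test_bit(map, i))	/* skip the zero gap */
			i++;
		j = i;
		while (j < NBITS && test_bit(map, j))	/* measure the run */
			j++;
		if (j - i >= need) {
			*len = j - i;
			return (long)i;
		}
		i = j;
	}
	return -1;
}

int main(void)
{
	unsigned char map[NBITS / 8] = { 0x70, 0x00, 0xff, 0x00 };	/* bits 4-6, 16-23 */
	unsigned long len;
	long at = find_run(map, 0, 5, &len);

	printf("run of %lu bits at %ld\n", len, at);
	return 0;
}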
360static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
361 *block_group, u64 *offset,
362 u64 *bytes, int debug)
363{
364 struct btrfs_free_space *entry;
365 struct rb_node *node;
366 int ret;
367
368 if (!block_group->free_space_offset.rb_node)
369 return NULL;
370
371 entry = tree_search_offset(block_group,
372 offset_to_bitmap(block_group, *offset),
373 0, 1);
374 if (!entry)
375 return NULL;
376
377 for (node = &entry->offset_index; node; node = rb_next(node)) {
378 entry = rb_entry(node, struct btrfs_free_space, offset_index);
379 if (entry->bytes < *bytes)
380 continue;
381
382 if (entry->bitmap) {
383 ret = search_bitmap(block_group, entry, offset, bytes);
384 if (!ret)
385 return entry;
386 continue;
387 }
388
389 *offset = entry->offset;
390 *bytes = entry->bytes;
391 return entry;
392 }
393
394 return NULL;
395}
396
397static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
398 struct btrfs_free_space *info, u64 offset)
399{
400 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
401 int max_bitmaps = (int)div64_u64(block_group->key.offset +
402 bytes_per_bg - 1, bytes_per_bg);
403 BUG_ON(block_group->total_bitmaps >= max_bitmaps);
404
405 info->offset = offset_to_bitmap(block_group, offset);
406 link_free_space(block_group, info);
407 block_group->total_bitmaps++;
408
409 recalculate_thresholds(block_group);
410}
411
412static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
413 struct btrfs_free_space *bitmap_info,
414 u64 *offset, u64 *bytes)
415{
416 u64 end;
417
418again:
419 end = bitmap_info->offset +
420 (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
421
422 if (*offset > bitmap_info->offset && *offset + *bytes > end) {
423 bitmap_clear_bits(block_group, bitmap_info, *offset,
424 end - *offset + 1);
425 *bytes -= end - *offset + 1;
426 *offset = end + 1;
427 } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
428 bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
429 *bytes = 0;
430 }
431
432 if (*bytes) {
433 if (!bitmap_info->bytes) {
434 unlink_free_space(block_group, bitmap_info);
435 kfree(bitmap_info->bitmap);
436 kfree(bitmap_info);
437 block_group->total_bitmaps--;
438 recalculate_thresholds(block_group);
439 }
440
441 bitmap_info = tree_search_offset(block_group,
442 offset_to_bitmap(block_group,
443 *offset),
444 1, 0);
445 if (!bitmap_info)
446 return -EINVAL;
447
448 if (!bitmap_info->bitmap)
449 return -EAGAIN;
450
451 goto again;
452 } else if (!bitmap_info->bytes) {
453 unlink_free_space(block_group, bitmap_info);
454 kfree(bitmap_info->bitmap);
455 kfree(bitmap_info);
456 block_group->total_bitmaps--;
457 recalculate_thresholds(block_group);
458 }
459
460 return 0;
461}
462
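remove_from_bitmap() above clamps each clear operation to the end of the current bitmap and loops into the next one when the freed range straddles a bitmap boundary. A sketch of that clamping loop with a stubbed-out clear_range() and an assumed 128 MiB window per bitmap:

#include <stdio.h>
#include <stdint.h>

#define WINDOW	(128ULL * 1024 * 1024)	/* bytes covered by one bitmap (assumed) */

static void clear_range(uint64_t off, uint64_t len)
{
	printf("clear %llu .. %llu\n", (unsigned long long)off,
	       (unsigned long long)(off + len - 1));
}

/* chop the removal into per-bitmap pieces, as the loop above does */
static void remove_span(uint64_t offset, uint64_t bytes)
{
	while (bytes) {
		uint64_t win_end = (offset / WINDOW + 1) * WINDOW;
		uint64_t chunk = bytes;

		if (offset + bytes > win_end)
			chunk = win_end - offset;	/* clamp to this bitmap */

		clear_range(offset, chunk);
		offset += chunk;
		bytes -= chunk;
	}
}

int main(void)
{
	remove_span(WINDOW - 4096, 3 * 4096);	/* straddles the first boundary */
	return 0;
}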
463static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
464 struct btrfs_free_space *info)
465{
466 struct btrfs_free_space *bitmap_info;
467 int added = 0;
468 u64 bytes, offset, end;
469 int ret;
470
471 /*
472 * If we are below the extents threshold then we can add this as an
473 * extent, and don't have to deal with the bitmap
474 */
475 if (block_group->free_extents < block_group->extents_thresh &&
476 info->bytes > block_group->sectorsize * 4)
477 return 0;
478
479 /*
480 * some block groups are so tiny they can't be enveloped by a bitmap, so
481 * don't even bother to create a bitmap for this
482 */
483 if (BITS_PER_BITMAP * block_group->sectorsize >
484 block_group->key.offset)
485 return 0;
486
487 bytes = info->bytes;
488 offset = info->offset;
489
490again:
491 bitmap_info = tree_search_offset(block_group,
492 offset_to_bitmap(block_group, offset),
493 1, 0);
494 if (!bitmap_info) {
495 BUG_ON(added);
496 goto new_bitmap;
497 }
498
499 end = bitmap_info->offset +
500 (u64)(BITS_PER_BITMAP * block_group->sectorsize);
501
502 if (offset >= bitmap_info->offset && offset + bytes > end) {
503 bitmap_set_bits(block_group, bitmap_info, offset,
504 end - offset);
505 bytes -= end - offset;
506 offset = end;
507 added = 0;
508 } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
509 bitmap_set_bits(block_group, bitmap_info, offset, bytes);
510 bytes = 0;
511 } else {
512 BUG();
513 }
514
515 if (!bytes) {
516 ret = 1;
517 goto out;
518 } else
519 goto again;
520
521new_bitmap:
522 if (info && info->bitmap) {
523 add_new_bitmap(block_group, info, offset);
524 added = 1;
525 info = NULL;
526 goto again;
527 } else {
528 spin_unlock(&block_group->tree_lock);
529
530 /* no pre-allocated info, allocate a new one */
531 if (!info) {
532 info = kzalloc(sizeof(struct btrfs_free_space),
533 GFP_NOFS);
534 if (!info) {
535 spin_lock(&block_group->tree_lock);
536 ret = -ENOMEM;
537 goto out;
538 }
539 }
540
541 /* allocate the bitmap */
542 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
543 spin_lock(&block_group->tree_lock);
544 if (!info->bitmap) {
545 ret = -ENOMEM;
546 goto out;
547 }
548 goto again;
549 }
550
551out:
552 if (info) {
553 if (info->bitmap)
554 kfree(info->bitmap);
555 kfree(info);
556 }
204 557
205 return ret; 558 return ret;
206} 559}
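Note the allocation dance in insert_into_bitmap() above: tree_lock is dropped before the GFP_NOFS allocations, retaken afterwards, and the search restarts ("goto again") because the tree may have changed in between. A userspace sketch of the same drop-allocate-retake-revalidate shape using a mutex; the shared cached_bitmap pointer is invented for the example (build with cc -pthread):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static void *cached_bitmap;		/* illustrative shared state */

static int add_bitmap(void)
{
	void *bitmap;

	pthread_mutex_lock(&tree_lock);
	while (!cached_bitmap) {
		pthread_mutex_unlock(&tree_lock);	/* never allocate under the lock */
		bitmap = calloc(1, 4096);
		pthread_mutex_lock(&tree_lock);
		if (!bitmap)
			break;				/* caller reports -ENOMEM */
		if (cached_bitmap) {			/* someone beat us to it */
			free(bitmap);
			continue;
		}
		cached_bitmap = bitmap;
	}
	pthread_mutex_unlock(&tree_lock);
	return cached_bitmap ? 0 : -1;
}

int main(void)
{
	return add_bitmap() ? 1 : 0;
}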
@@ -208,8 +561,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 561int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
209 u64 offset, u64 bytes) 562 u64 offset, u64 bytes)
210{ 563{
211 struct btrfs_free_space *right_info; 564 struct btrfs_free_space *right_info = NULL;
212 struct btrfs_free_space *left_info; 565 struct btrfs_free_space *left_info = NULL;
213 struct btrfs_free_space *info = NULL; 566 struct btrfs_free_space *info = NULL;
214 int ret = 0; 567 int ret = 0;
215 568
@@ -227,18 +580,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
227 * are adding, if there is remove that struct and add a new one to 580 * are adding, if there is remove that struct and add a new one to
228 * cover the entire range 581 * cover the entire range
229 */ 582 */
230 right_info = tree_search_offset(&block_group->free_space_offset, 583 right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
231 offset+bytes, 0, 0); 584 if (right_info && rb_prev(&right_info->offset_index))
232 left_info = tree_search_offset(&block_group->free_space_offset, 585 left_info = rb_entry(rb_prev(&right_info->offset_index),
233 offset-1, 0, 1); 586 struct btrfs_free_space, offset_index);
587 else
588 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
234 589
235 if (right_info) { 590 /*
591 * If there was no extent directly to the left or right of this new
592 * extent then we know we're going to have to allocate a new extent, so
593 * before we do that see if we need to drop this into a bitmap
594 */
595 if ((!left_info || left_info->bitmap) &&
596 (!right_info || right_info->bitmap)) {
597 ret = insert_into_bitmap(block_group, info);
598
599 if (ret < 0) {
600 goto out;
601 } else if (ret) {
602 ret = 0;
603 goto out;
604 }
605 }
606
607 if (right_info && !right_info->bitmap) {
236 unlink_free_space(block_group, right_info); 608 unlink_free_space(block_group, right_info);
237 info->bytes += right_info->bytes; 609 info->bytes += right_info->bytes;
238 kfree(right_info); 610 kfree(right_info);
239 } 611 }
240 612
241 if (left_info && left_info->offset + left_info->bytes == offset) { 613 if (left_info && !left_info->bitmap &&
614 left_info->offset + left_info->bytes == offset) {
242 unlink_free_space(block_group, left_info); 615 unlink_free_space(block_group, left_info);
243 info->offset = left_info->offset; 616 info->offset = left_info->offset;
244 info->bytes += left_info->bytes; 617 info->bytes += left_info->bytes;
@@ -248,11 +621,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
248 ret = link_free_space(block_group, info); 621 ret = link_free_space(block_group, info);
249 if (ret) 622 if (ret)
250 kfree(info); 623 kfree(info);
251 624out:
252 spin_unlock(&block_group->tree_lock); 625 spin_unlock(&block_group->tree_lock);
253 626
254 if (ret) { 627 if (ret) {
255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 628 printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
256 BUG_ON(ret == -EEXIST); 629 BUG_ON(ret == -EEXIST);
257 } 630 }
258 631
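btrfs_add_free_space() above first tries to glue the freed range onto an extent entry ending exactly at its start (left_info) or beginning exactly at its end (right_info), and only falls back to a bitmap when neither neighbour is a plain extent. The coalescing step in isolation, over throwaway structs:

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t offset, bytes; };

/* if b starts exactly where a ends, fold b into a and report success */
static int try_merge(struct range *a, const struct range *b)
{
	if (a->offset + a->bytes != b->offset)
		return 0;
	a->bytes += b->bytes;
	return 1;
}

int main(void)
{
	struct range left  = { 0, 4096 };	/* existing free extent */
	struct range freed = { 4096, 8192 };	/* range being returned */
	struct range right = { 12288, 4096 };	/* existing free extent */

	try_merge(&left, &freed);	/* left now covers 0 .. 12287 */
	try_merge(&left, &right);	/* left now covers 0 .. 16383 */
	printf("merged: offset %llu bytes %llu\n",
	       (unsigned long long)left.offset,
	       (unsigned long long)left.bytes);
	return 0;
}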
@@ -263,40 +636,65 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
263 u64 offset, u64 bytes) 636 u64 offset, u64 bytes)
264{ 637{
265 struct btrfs_free_space *info; 638 struct btrfs_free_space *info;
639 struct btrfs_free_space *next_info = NULL;
266 int ret = 0; 640 int ret = 0;
267 641
268 spin_lock(&block_group->tree_lock); 642 spin_lock(&block_group->tree_lock);
269 643
270 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 644again:
271 1); 645 info = tree_search_offset(block_group, offset, 0, 0);
272 if (info && info->offset == offset) { 646 if (!info) {
273 if (info->bytes < bytes) { 647 WARN_ON(1);
274 printk(KERN_ERR "Found free space at %llu, size %llu," 648 goto out_lock;
275 "trying to use %llu\n", 649 }
276 (unsigned long long)info->offset, 650
277 (unsigned long long)info->bytes, 651 if (info->bytes < bytes && rb_next(&info->offset_index)) {
278 (unsigned long long)bytes); 652 u64 end;
653 next_info = rb_entry(rb_next(&info->offset_index),
654 struct btrfs_free_space,
655 offset_index);
656
657 if (next_info->bitmap)
658 end = next_info->offset + BITS_PER_BITMAP *
659 block_group->sectorsize - 1;
660 else
661 end = next_info->offset + next_info->bytes;
662
663 if (next_info->bytes < bytes ||
664 next_info->offset > offset || offset > end) {
665 printk(KERN_CRIT "Found free space at %llu, size %llu,"
666 " trying to use %llu\n",
667 (unsigned long long)info->offset,
668 (unsigned long long)info->bytes,
669 (unsigned long long)bytes);
279 WARN_ON(1); 670 WARN_ON(1);
280 ret = -EINVAL; 671 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock); 672 goto out_lock;
282 goto out;
283 } 673 }
284 unlink_free_space(block_group, info);
285 674
286 if (info->bytes == bytes) { 675 info = next_info;
287 kfree(info); 676 }
288 spin_unlock(&block_group->tree_lock); 677
289 goto out; 678 if (info->bytes == bytes) {
679 unlink_free_space(block_group, info);
680 if (info->bitmap) {
681 kfree(info->bitmap);
682 block_group->total_bitmaps--;
290 } 683 }
684 kfree(info);
685 goto out_lock;
686 }
291 687
688 if (!info->bitmap && info->offset == offset) {
689 unlink_free_space(block_group, info);
292 info->offset += bytes; 690 info->offset += bytes;
293 info->bytes -= bytes; 691 info->bytes -= bytes;
692 link_free_space(block_group, info);
693 goto out_lock;
694 }
294 695
295 ret = link_free_space(block_group, info); 696 if (!info->bitmap && info->offset <= offset &&
296 spin_unlock(&block_group->tree_lock); 697 info->offset + info->bytes >= offset + bytes) {
297 BUG_ON(ret);
298 } else if (info && info->offset < offset &&
299 info->offset + info->bytes >= offset + bytes) {
300 u64 old_start = info->offset; 698 u64 old_start = info->offset;
301 /* 699 /*
302 * we're freeing space in the middle of the info, 700 * we're freeing space in the middle of the info,
@@ -312,7 +710,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
312 info->offset = offset + bytes; 710 info->offset = offset + bytes;
313 info->bytes = old_end - info->offset; 711 info->bytes = old_end - info->offset;
314 ret = link_free_space(block_group, info); 712 ret = link_free_space(block_group, info);
315 BUG_ON(ret); 713 WARN_ON(ret);
714 if (ret)
715 goto out_lock;
316 } else { 716 } else {
317 /* the hole we're creating ends at the end 717 /* the hole we're creating ends at the end
318 * of the info struct, just free the info 718 * of the info struct, just free the info
@@ -320,32 +720,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
320 kfree(info); 720 kfree(info);
321 } 721 }
322 spin_unlock(&block_group->tree_lock); 722 spin_unlock(&block_group->tree_lock);
323 /* step two, insert a new info struct to cover anything 723
324 * before the hole 724 /* step two, insert a new info struct to cover
725 * anything before the hole
325 */ 726 */
326 ret = btrfs_add_free_space(block_group, old_start, 727 ret = btrfs_add_free_space(block_group, old_start,
327 offset - old_start); 728 offset - old_start);
328 BUG_ON(ret); 729 WARN_ON(ret);
329 } else { 730 goto out;
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached,
336 (unsigned long long)block_group->key.objectid,
337 (unsigned long long)block_group->key.offset);
338 btrfs_dump_free_space(block_group, bytes);
339 } else if (info) {
340 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
341 "but wanted offset=%llu bytes=%llu\n",
342 (unsigned long long)info->offset,
343 (unsigned long long)info->bytes,
344 (unsigned long long)offset,
345 (unsigned long long)bytes);
346 }
347 WARN_ON(1);
348 } 731 }
732
733 ret = remove_from_bitmap(block_group, info, &offset, &bytes);
734 if (ret == -EAGAIN)
735 goto again;
736 BUG_ON(ret);
737out_lock:
738 spin_unlock(&block_group->tree_lock);
349out: 739out:
350 return ret; 740 return ret;
351} 741}
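When the removed range sits in the middle of an extent entry, the code above shrinks the entry to the piece after the hole and re-adds the piece before it via btrfs_add_free_space(). The splitting arithmetic on its own, with invented types:

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t offset, bytes; };

/* carve [offset, offset + bytes) out of *info; *front gets the piece before
 * the hole, *info keeps the piece after it */
static int carve(struct range *info, uint64_t offset, uint64_t bytes,
		 struct range *front)
{
	uint64_t old_end = info->offset + info->bytes;

	if (offset < info->offset || offset + bytes > old_end)
		return -1;		/* removal not fully contained */

	front->offset = info->offset;
	front->bytes = offset - info->offset;

	info->offset = offset + bytes;
	info->bytes = old_end - info->offset;
	return 0;
}

int main(void)
{
	struct range info = { 0, 1 << 20 }, front;

	carve(&info, 4096, 8192, &front);
	printf("front %llu+%llu, back %llu+%llu\n",
	       (unsigned long long)front.offset, (unsigned long long)front.bytes,
	       (unsigned long long)info.offset, (unsigned long long)info.bytes);
	return 0;
}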
@@ -361,10 +751,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
361 info = rb_entry(n, struct btrfs_free_space, offset_index); 751 info = rb_entry(n, struct btrfs_free_space, offset_index);
362 if (info->bytes >= bytes) 752 if (info->bytes >= bytes)
363 count++; 753 count++;
364 printk(KERN_ERR "entry offset %llu, bytes %llu\n", 754 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
365 (unsigned long long)info->offset, 755 (unsigned long long)info->offset,
366 (unsigned long long)info->bytes); 756 (unsigned long long)info->bytes,
757 (info->bitmap) ? "yes" : "no");
367 } 758 }
759 printk(KERN_INFO "block group has cluster?: %s\n",
760 list_empty(&block_group->cluster_list) ? "no" : "yes");
368 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 761 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
369 "\n", count); 762 "\n", count);
370} 763}
@@ -397,26 +790,35 @@ __btrfs_return_cluster_to_free_space(
397{ 790{
398 struct btrfs_free_space *entry; 791 struct btrfs_free_space *entry;
399 struct rb_node *node; 792 struct rb_node *node;
793 bool bitmap;
400 794
401 spin_lock(&cluster->lock); 795 spin_lock(&cluster->lock);
402 if (cluster->block_group != block_group) 796 if (cluster->block_group != block_group)
403 goto out; 797 goto out;
404 798
799 bitmap = cluster->points_to_bitmap;
800 cluster->block_group = NULL;
405 cluster->window_start = 0; 801 cluster->window_start = 0;
802 list_del_init(&cluster->block_group_list);
803 cluster->points_to_bitmap = false;
804
805 if (bitmap)
806 goto out;
807
406 node = rb_first(&cluster->root); 808 node = rb_first(&cluster->root);
407 while(node) { 809 while (node) {
408 entry = rb_entry(node, struct btrfs_free_space, offset_index); 810 entry = rb_entry(node, struct btrfs_free_space, offset_index);
409 node = rb_next(&entry->offset_index); 811 node = rb_next(&entry->offset_index);
410 rb_erase(&entry->offset_index, &cluster->root); 812 rb_erase(&entry->offset_index, &cluster->root);
411 link_free_space(block_group, entry); 813 BUG_ON(entry->bitmap);
814 tree_insert_offset(&block_group->free_space_offset,
815 entry->offset, &entry->offset_index, 0);
412 } 816 }
413 list_del_init(&cluster->block_group_list);
414
415 btrfs_put_block_group(cluster->block_group);
416 cluster->block_group = NULL;
417 cluster->root.rb_node = NULL; 817 cluster->root.rb_node = NULL;
818
418out: 819out:
419 spin_unlock(&cluster->lock); 820 spin_unlock(&cluster->lock);
821 btrfs_put_block_group(block_group);
420 return 0; 822 return 0;
421} 823}
422 824
@@ -425,20 +827,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
425 struct btrfs_free_space *info; 827 struct btrfs_free_space *info;
426 struct rb_node *node; 828 struct rb_node *node;
427 struct btrfs_free_cluster *cluster; 829 struct btrfs_free_cluster *cluster;
428 struct btrfs_free_cluster *safe; 830 struct list_head *head;
429 831
430 spin_lock(&block_group->tree_lock); 832 spin_lock(&block_group->tree_lock);
431 833 while ((head = block_group->cluster_list.next) !=
432 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, 834 &block_group->cluster_list) {
433 block_group_list) { 835 cluster = list_entry(head, struct btrfs_free_cluster,
836 block_group_list);
434 837
435 WARN_ON(cluster->block_group != block_group); 838 WARN_ON(cluster->block_group != block_group);
436 __btrfs_return_cluster_to_free_space(block_group, cluster); 839 __btrfs_return_cluster_to_free_space(block_group, cluster);
840 if (need_resched()) {
841 spin_unlock(&block_group->tree_lock);
842 cond_resched();
843 spin_lock(&block_group->tree_lock);
844 }
437 } 845 }
438 846
439 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 847 while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
440 info = rb_entry(node, struct btrfs_free_space, bytes_index); 848 info = rb_entry(node, struct btrfs_free_space, offset_index);
441 unlink_free_space(block_group, info); 849 unlink_free_space(block_group, info);
850 if (info->bitmap)
851 kfree(info->bitmap);
442 kfree(info); 852 kfree(info);
443 if (need_resched()) { 853 if (need_resched()) {
444 spin_unlock(&block_group->tree_lock); 854 spin_unlock(&block_group->tree_lock);
@@ -446,6 +856,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
446 spin_lock(&block_group->tree_lock); 856 spin_lock(&block_group->tree_lock);
447 } 857 }
448 } 858 }
859
449 spin_unlock(&block_group->tree_lock); 860 spin_unlock(&block_group->tree_lock);
450} 861}
451 862
@@ -453,25 +864,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
453 u64 offset, u64 bytes, u64 empty_size) 864 u64 offset, u64 bytes, u64 empty_size)
454{ 865{
455 struct btrfs_free_space *entry = NULL; 866 struct btrfs_free_space *entry = NULL;
867 u64 bytes_search = bytes + empty_size;
456 u64 ret = 0; 868 u64 ret = 0;
457 869
458 spin_lock(&block_group->tree_lock); 870 spin_lock(&block_group->tree_lock);
459 entry = tree_search_offset(&block_group->free_space_offset, offset, 871 entry = find_free_space(block_group, &offset, &bytes_search, 0);
460 bytes + empty_size, 1);
461 if (!entry) 872 if (!entry)
462 entry = tree_search_bytes(&block_group->free_space_bytes, 873 goto out;
463 offset, bytes + empty_size); 874
464 if (entry) { 875 ret = offset;
876 if (entry->bitmap) {
877 bitmap_clear_bits(block_group, entry, offset, bytes);
878 if (!entry->bytes) {
879 unlink_free_space(block_group, entry);
880 kfree(entry->bitmap);
881 kfree(entry);
882 block_group->total_bitmaps--;
883 recalculate_thresholds(block_group);
884 }
885 } else {
465 unlink_free_space(block_group, entry); 886 unlink_free_space(block_group, entry);
466 ret = entry->offset;
467 entry->offset += bytes; 887 entry->offset += bytes;
468 entry->bytes -= bytes; 888 entry->bytes -= bytes;
469
470 if (!entry->bytes) 889 if (!entry->bytes)
471 kfree(entry); 890 kfree(entry);
472 else 891 else
473 link_free_space(block_group, entry); 892 link_free_space(block_group, entry);
474 } 893 }
894
895out:
475 spin_unlock(&block_group->tree_lock); 896 spin_unlock(&block_group->tree_lock);
476 897
477 return ret; 898 return ret;
@@ -517,6 +938,47 @@ int btrfs_return_cluster_to_free_space(
517 return ret; 938 return ret;
518} 939}
519 940
941static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
942 struct btrfs_free_cluster *cluster,
943 u64 bytes, u64 min_start)
944{
945 struct btrfs_free_space *entry;
946 int err;
947 u64 search_start = cluster->window_start;
948 u64 search_bytes = bytes;
949 u64 ret = 0;
950
951 spin_lock(&block_group->tree_lock);
952 spin_lock(&cluster->lock);
953
954 if (!cluster->points_to_bitmap)
955 goto out;
956
957 if (cluster->block_group != block_group)
958 goto out;
959
960 entry = tree_search_offset(block_group, search_start, 0, 0);
961
962 if (!entry || !entry->bitmap)
963 goto out;
964
965 search_start = min_start;
966 search_bytes = bytes;
967
968 err = search_bitmap(block_group, entry, &search_start,
969 &search_bytes);
970 if (err)
971 goto out;
972
973 ret = search_start;
974 bitmap_clear_bits(block_group, entry, ret, bytes);
975out:
976 spin_unlock(&cluster->lock);
977 spin_unlock(&block_group->tree_lock);
978
979 return ret;
980}
981
520/* 982/*
521 * given a cluster, try to allocate 'bytes' from it, returns 0 983 * given a cluster, try to allocate 'bytes' from it, returns 0
522 * if it couldn't find anything suitably large, or a logical disk offset 984 * if it couldn't find anything suitably large, or a logical disk offset
@@ -530,6 +992,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
530 struct rb_node *node; 992 struct rb_node *node;
531 u64 ret = 0; 993 u64 ret = 0;
532 994
995 if (cluster->points_to_bitmap)
996 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
997 min_start);
998
533 spin_lock(&cluster->lock); 999 spin_lock(&cluster->lock);
534 if (bytes > cluster->max_size) 1000 if (bytes > cluster->max_size)
535 goto out; 1001 goto out;
@@ -567,9 +1033,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
567 } 1033 }
568out: 1034out:
569 spin_unlock(&cluster->lock); 1035 spin_unlock(&cluster->lock);
1036
570 return ret; 1037 return ret;
571} 1038}
572 1039
1040static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1041 struct btrfs_free_space *entry,
1042 struct btrfs_free_cluster *cluster,
1043 u64 offset, u64 bytes, u64 min_bytes)
1044{
1045 unsigned long next_zero;
1046 unsigned long i;
1047 unsigned long search_bits;
1048 unsigned long total_bits;
1049 unsigned long found_bits;
1050 unsigned long start = 0;
1051 unsigned long total_found = 0;
1052 bool found = false;
1053
1054 i = offset_to_bit(entry->offset, block_group->sectorsize,
1055 max_t(u64, offset, entry->offset));
1056 search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1057 total_bits = bytes_to_bits(bytes, block_group->sectorsize);
1058
1059again:
1060 found_bits = 0;
1061 for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
1062 i < BITS_PER_BITMAP;
1063 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
1064 next_zero = find_next_zero_bit(entry->bitmap,
1065 BITS_PER_BITMAP, i);
1066 if (next_zero - i >= search_bits) {
1067 found_bits = next_zero - i;
1068 break;
1069 }
1070 i = next_zero;
1071 }
1072
1073 if (!found_bits)
1074 return -1;
1075
1076 if (!found) {
1077 start = i;
1078 found = true;
1079 }
1080
1081 total_found += found_bits;
1082
1083 if (cluster->max_size < found_bits * block_group->sectorsize)
1084 cluster->max_size = found_bits * block_group->sectorsize;
1085
1086 if (total_found < total_bits) {
1087 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
1088 if (i - start > total_bits * 2) {
1089 total_found = 0;
1090 cluster->max_size = 0;
1091 found = false;
1092 }
1093 goto again;
1094 }
1095
1096 cluster->window_start = start * block_group->sectorsize +
1097 entry->offset;
1098 cluster->points_to_bitmap = true;
1099
1100 return 0;
1101}
1102
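btrfs_bitmap_cluster() above accumulates successive runs of set bits into a window until enough bits are gathered, and starts over when the window spreads out to more than twice the target. A rough standalone model of that accumulate-and-reset loop; the run table and numbers are made up and the reset rule is simplified:

#include <stdio.h>

int main(void)
{
	/* start bit and length of successive runs of set bits (invented) */
	struct { unsigned start, len; } runs[] = {
		{ 0, 2 }, { 40, 3 }, { 44, 4 }, { 50, 6 }
	};
	unsigned target = 10, total = 0, win_start = 0, i;
	int found = 0;

	for (i = 0; i < 4; i++) {
		if (!found) {
			win_start = runs[i].start;
			found = 1;
		}
		total += runs[i].len;
		if (total >= target)
			break;
		/* window spread out too far: forget it and start over */
		if (runs[i].start + runs[i].len - win_start > target * 2) {
			total = 0;
			found = 0;
		}
	}
	printf("window starts at bit %u with %u bits gathered\n",
	       win_start, total);
	return 0;
}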
573/* 1103/*
574 * here we try to find a cluster of blocks in a block group. The goal 1104 * here we try to find a cluster of blocks in a block group. The goal
575 * is to find at least bytes free and up to empty_size + bytes free. 1105 * is to find at least bytes free and up to empty_size + bytes free.
@@ -587,12 +1117,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
587 struct btrfs_free_space *entry = NULL; 1117 struct btrfs_free_space *entry = NULL;
588 struct rb_node *node; 1118 struct rb_node *node;
589 struct btrfs_free_space *next; 1119 struct btrfs_free_space *next;
590 struct btrfs_free_space *last; 1120 struct btrfs_free_space *last = NULL;
591 u64 min_bytes; 1121 u64 min_bytes;
592 u64 window_start; 1122 u64 window_start;
593 u64 window_free; 1123 u64 window_free;
594 u64 max_extent = 0; 1124 u64 max_extent = 0;
595 int total_retries = 0; 1125 bool found_bitmap = false;
596 int ret; 1126 int ret;
597 1127
598 /* for metadata, allow allocates with more holes */ 1128 /* for metadata, allow allocates with more holes */
@@ -620,31 +1150,80 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
620 goto out; 1150 goto out;
621 } 1151 }
622again: 1152again:
623 min_bytes = min(min_bytes, bytes + empty_size); 1153 entry = tree_search_offset(block_group, offset, found_bitmap, 1);
624 entry = tree_search_bytes(&block_group->free_space_bytes,
625 offset, min_bytes);
626 if (!entry) { 1154 if (!entry) {
627 ret = -ENOSPC; 1155 ret = -ENOSPC;
628 goto out; 1156 goto out;
629 } 1157 }
1158
1159 /*
1160 * If found_bitmap is true, we exhausted our search for extent entries,
1161 * and we just want to search all of the bitmaps that we can find, and
1162 * ignore any extent entries we find.
1163 */
1164 while (entry->bitmap || found_bitmap ||
1165 (!entry->bitmap && entry->bytes < min_bytes)) {
1166 struct rb_node *node = rb_next(&entry->offset_index);
1167
1168 if (entry->bitmap && entry->bytes > bytes + empty_size) {
1169 ret = btrfs_bitmap_cluster(block_group, entry, cluster,
1170 offset, bytes + empty_size,
1171 min_bytes);
1172 if (!ret)
1173 goto got_it;
1174 }
1175
1176 if (!node) {
1177 ret = -ENOSPC;
1178 goto out;
1179 }
1180 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1181 }
1182
1183 /*
1184 * We already searched all the extent entries from the passed in offset
1185 * to the end and didn't find enough space for the cluster, and we also
1186 * didn't find any bitmaps that met our criteria, just go ahead and exit
1187 */
1188 if (found_bitmap) {
1189 ret = -ENOSPC;
1190 goto out;
1191 }
1192
1193 cluster->points_to_bitmap = false;
630 window_start = entry->offset; 1194 window_start = entry->offset;
631 window_free = entry->bytes; 1195 window_free = entry->bytes;
632 last = entry; 1196 last = entry;
633 max_extent = entry->bytes; 1197 max_extent = entry->bytes;
634 1198
635 while(1) { 1199 while (1) {
636 /* out window is just right, lets fill it */ 1200 /* out window is just right, lets fill it */
637 if (window_free >= bytes + empty_size) 1201 if (window_free >= bytes + empty_size)
638 break; 1202 break;
639 1203
640 node = rb_next(&last->offset_index); 1204 node = rb_next(&last->offset_index);
641 if (!node) { 1205 if (!node) {
1206 if (found_bitmap)
1207 goto again;
642 ret = -ENOSPC; 1208 ret = -ENOSPC;
643 goto out; 1209 goto out;
644 } 1210 }
645 next = rb_entry(node, struct btrfs_free_space, offset_index); 1211 next = rb_entry(node, struct btrfs_free_space, offset_index);
646 1212
647 /* 1213 /*
1214 * we found a bitmap, so if this search doesn't result in a
1215 * cluster, we know to go and search again for the bitmaps and
1216 * start looking for space there
1217 */
1218 if (next->bitmap) {
1219 if (!found_bitmap)
1220 offset = next->offset;
1221 found_bitmap = true;
1222 last = next;
1223 continue;
1224 }
1225
1226 /*
648 * we haven't filled the empty size and the window is 1227 * we haven't filled the empty size and the window is
649 * very large. reset and try again 1228 * very large. reset and try again
650 */ 1229 */
@@ -655,19 +1234,6 @@ again:
655 window_free = entry->bytes; 1234 window_free = entry->bytes;
656 last = entry; 1235 last = entry;
657 max_extent = 0; 1236 max_extent = 0;
658 total_retries++;
659 if (total_retries % 64 == 0) {
660 if (min_bytes >= (bytes + empty_size)) {
661 ret = -ENOSPC;
662 goto out;
663 }
664 /*
665 * grow our allocation a bit, we're not having
666 * much luck
667 */
668 min_bytes *= 2;
669 goto again;
670 }
671 } else { 1237 } else {
672 last = next; 1238 last = next;
673 window_free += next->bytes; 1239 window_free += next->bytes;
@@ -685,11 +1251,19 @@ again:
685 * The cluster includes an rbtree, but only uses the offset index 1251 * The cluster includes an rbtree, but only uses the offset index
686 * of each free space cache entry. 1252 * of each free space cache entry.
687 */ 1253 */
688 while(1) { 1254 while (1) {
689 node = rb_next(&entry->offset_index); 1255 node = rb_next(&entry->offset_index);
690 unlink_free_space(block_group, entry); 1256 if (entry->bitmap && node) {
1257 entry = rb_entry(node, struct btrfs_free_space,
1258 offset_index);
1259 continue;
1260 } else if (entry->bitmap && !node) {
1261 break;
1262 }
1263
1264 rb_erase(&entry->offset_index, &block_group->free_space_offset);
691 ret = tree_insert_offset(&cluster->root, entry->offset, 1265 ret = tree_insert_offset(&cluster->root, entry->offset,
692 &entry->offset_index); 1266 &entry->offset_index, 0);
693 BUG_ON(ret); 1267 BUG_ON(ret);
694 1268
695 if (!node || entry == last) 1269 if (!node || entry == last)
@@ -697,8 +1271,10 @@ again:
697 1271
698 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1272 entry = rb_entry(node, struct btrfs_free_space, offset_index);
699 } 1273 }
700 ret = 0; 1274
701 cluster->max_size = max_extent; 1275 cluster->max_size = max_extent;
1276got_it:
1277 ret = 0;
702 atomic_inc(&block_group->count); 1278 atomic_inc(&block_group->count);
703 list_add_tail(&cluster->block_group_list, &block_group->cluster_list); 1279 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
704 cluster->block_group = block_group; 1280 cluster->block_group = block_group;
@@ -718,6 +1294,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
718 spin_lock_init(&cluster->refill_lock); 1294 spin_lock_init(&cluster->refill_lock);
719 cluster->root.rb_node = NULL; 1295 cluster->root.rb_node = NULL;
720 cluster->max_size = 0; 1296 cluster->max_size = 0;
1297 cluster->points_to_bitmap = false;
721 INIT_LIST_HEAD(&cluster->block_group_list); 1298 INIT_LIST_HEAD(&cluster->block_group_list);
722 cluster->block_group = NULL; 1299 cluster->block_group = NULL;
723} 1300}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 266fb8764054..890a8e79011b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -19,6 +19,14 @@
19#ifndef __BTRFS_FREE_SPACE_CACHE 19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE 20#define __BTRFS_FREE_SPACE_CACHE
21 21
22struct btrfs_free_space {
23 struct rb_node offset_index;
24 u64 offset;
25 u64 bytes;
26 unsigned long *bitmap;
27 struct list_head list;
28};
29
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 30int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size); 31 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 32int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 791eab19e330..56fe83fa60c4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2603,8 +2603,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2603 if (root->ref_cows) 2603 if (root->ref_cows)
2604 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 2604 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2605 path = btrfs_alloc_path(); 2605 path = btrfs_alloc_path();
2606 path->reada = -1;
2607 BUG_ON(!path); 2606 BUG_ON(!path);
2607 path->reada = -1;
2608 2608
2609 /* FIXME, add redo link to tree so we don't leak on crash */ 2609 /* FIXME, add redo link to tree so we don't leak on crash */
2610 key.objectid = inode->i_ino; 2610 key.objectid = inode->i_ino;
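The two-line reorder above only ensures the freshly allocated path is checked before its reada field is written. The same ordering in miniature, with a graceful error return standing in for the BUG_ON():

#include <stdlib.h>

struct path { int reada; };

static struct path *alloc_path(void)
{
	return calloc(1, sizeof(struct path));
}

int main(void)
{
	struct path *path = alloc_path();

	if (!path)		/* check the allocation first ...   */
		return 1;
	path->reada = -1;	/* ... and only then dereference it */
	free(path);
	return 0;
}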
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 6d6523da0a30..0d126be22b63 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -309,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
309 } 309 }
310 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", 310 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
311 (unsigned long long)btrfs_header_bytenr(c), 311 (unsigned long long)btrfs_header_bytenr(c),
312 btrfs_header_level(c), nr, 312 level, nr,
313 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); 313 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
314 for (i = 0; i < nr; i++) { 314 for (i = 0; i < nr; i++) {
315 btrfs_node_key_to_cpu(c, &key, i); 315 btrfs_node_key_to_cpu(c, &key, i);
@@ -326,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
326 btrfs_level_size(root, level - 1), 326 btrfs_level_size(root, level - 1),
327 btrfs_node_ptr_generation(c, i)); 327 btrfs_node_ptr_generation(c, i));
328 if (btrfs_is_leaf(next) && 328 if (btrfs_is_leaf(next) &&
329 btrfs_header_level(c) != 1) 329 level != 1)
330 BUG(); 330 BUG();
331 if (btrfs_header_level(next) != 331 if (btrfs_header_level(next) !=
332 btrfs_header_level(c) - 1) 332 level - 1)
333 BUG(); 333 BUG();
334 btrfs_print_tree(root, next); 334 btrfs_print_tree(root, next);
335 free_extent_buffer(next); 335 free_extent_buffer(next);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 008397934778..e71264d1c2c9 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -670,6 +670,8 @@ again:
670 err = ret; 670 err = ret;
671 goto out; 671 goto out;
672 } 672 }
673 if (ret > 0 && path2->slots[level] > 0)
674 path2->slots[level]--;
673 675
674 eb = path2->nodes[level]; 676 eb = path2->nodes[level];
675 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != 677 WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
@@ -1609,6 +1611,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1609 BUG_ON(level == 0); 1611 BUG_ON(level == 0);
1610 path->lowest_level = level; 1612 path->lowest_level = level;
1611 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); 1613 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1614 path->lowest_level = 0;
1612 if (ret < 0) { 1615 if (ret < 0) {
1613 btrfs_free_path(path); 1616 btrfs_free_path(path);
1614 return ret; 1617 return ret;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2dbf1c1f56ee..e51d2bc532f8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -40,6 +40,14 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
40 } 40 }
41} 41}
42 42
43static noinline void switch_commit_root(struct btrfs_root *root)
44{
45 down_write(&root->commit_root_sem);
46 free_extent_buffer(root->commit_root);
47 root->commit_root = btrfs_root_node(root);
48 up_write(&root->commit_root_sem);
49}
50
43/* 51/*
44 * either allocate a new transaction or hop into the existing one 52 * either allocate a new transaction or hop into the existing one
45 */ 53 */
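The new switch_commit_root() helper swaps the commit root pointer under commit_root_sem so readers never observe the swap halfway, dropping the reference on the old root and pinning the new one. A userspace analogue with a pthread rwlock and a toy refcount; every name here is illustrative, not the btrfs API (build with cc -pthread):

#include <pthread.h>
#include <stdlib.h>

struct snapshot { int refs; /* ... tree state ... */ };

static pthread_rwlock_t commit_sem = PTHREAD_RWLOCK_INITIALIZER;
static struct snapshot *commit_root;

static void put_snapshot(struct snapshot *s)
{
	if (s && --s->refs == 0)
		free(s);
}

static struct snapshot *get_snapshot(struct snapshot *s)
{
	s->refs++;
	return s;
}

/* swap the published commit root under the write lock, dropping the old
 * reference and pinning the new one, as the helper above does */
static void switch_commit_root(struct snapshot *current_root)
{
	pthread_rwlock_wrlock(&commit_sem);
	put_snapshot(commit_root);
	commit_root = get_snapshot(current_root);
	pthread_rwlock_unlock(&commit_sem);
}

int main(void)
{
	struct snapshot *root = calloc(1, sizeof(*root));

	if (!root)
		return 1;
	root->refs = 1;
	switch_commit_root(root);
	/* readers would take commit_sem for read before using commit_root */
	pthread_rwlock_wrlock(&commit_sem);
	put_snapshot(commit_root);
	commit_root = NULL;
	pthread_rwlock_unlock(&commit_sem);
	put_snapshot(root);
	return 0;
}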
@@ -444,9 +452,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
444 452
445 btrfs_write_dirty_block_groups(trans, root); 453 btrfs_write_dirty_block_groups(trans, root);
446 454
447 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
448 BUG_ON(ret);
449
450 while (1) { 455 while (1) {
451 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 456 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
452 if (old_root_bytenr == root->node->start) 457 if (old_root_bytenr == root->node->start)
@@ -457,13 +462,11 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
457 &root->root_key, 462 &root->root_key,
458 &root->root_item); 463 &root->root_item);
459 BUG_ON(ret); 464 BUG_ON(ret);
460 btrfs_write_dirty_block_groups(trans, root);
461 465
462 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 466 ret = btrfs_write_dirty_block_groups(trans, root);
463 BUG_ON(ret); 467 BUG_ON(ret);
464 } 468 }
465 free_extent_buffer(root->commit_root); 469 switch_commit_root(root);
466 root->commit_root = btrfs_root_node(root);
467 return 0; 470 return 0;
468} 471}
469 472
@@ -495,9 +498,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
495 root = list_entry(next, struct btrfs_root, dirty_list); 498 root = list_entry(next, struct btrfs_root, dirty_list);
496 499
497 update_cowonly_root(trans, root); 500 update_cowonly_root(trans, root);
498
499 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
500 BUG_ON(ret);
501 } 501 }
502 return 0; 502 return 0;
503} 503}
@@ -544,8 +544,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
544 btrfs_update_reloc_root(trans, root); 544 btrfs_update_reloc_root(trans, root);
545 545
546 if (root->commit_root != root->node) { 546 if (root->commit_root != root->node) {
547 free_extent_buffer(root->commit_root); 547 switch_commit_root(root);
548 root->commit_root = btrfs_root_node(root);
549 btrfs_set_root_node(&root->root_item, 548 btrfs_set_root_node(&root->root_item,
550 root->node); 549 root->node);
551 } 550 }
@@ -943,9 +942,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
943 942
944 mutex_unlock(&root->fs_info->trans_mutex); 943 mutex_unlock(&root->fs_info->trans_mutex);
945 944
946 if (flush_on_commit || snap_pending) { 945 if (flush_on_commit) {
947 if (flush_on_commit) 946 btrfs_start_delalloc_inodes(root);
948 btrfs_start_delalloc_inodes(root); 947 ret = btrfs_wait_ordered_extents(root, 0);
948 BUG_ON(ret);
949 } else if (snap_pending) {
949 ret = btrfs_wait_ordered_extents(root, 1); 950 ret = btrfs_wait_ordered_extents(root, 1);
950 BUG_ON(ret); 951 BUG_ON(ret);
951 } 952 }
@@ -1009,15 +1010,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1009 1010
1010 btrfs_set_root_node(&root->fs_info->tree_root->root_item, 1011 btrfs_set_root_node(&root->fs_info->tree_root->root_item,
1011 root->fs_info->tree_root->node); 1012 root->fs_info->tree_root->node);
1012 free_extent_buffer(root->fs_info->tree_root->commit_root); 1013 switch_commit_root(root->fs_info->tree_root);
1013 root->fs_info->tree_root->commit_root =
1014 btrfs_root_node(root->fs_info->tree_root);
1015 1014
1016 btrfs_set_root_node(&root->fs_info->chunk_root->root_item, 1015 btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
1017 root->fs_info->chunk_root->node); 1016 root->fs_info->chunk_root->node);
1018 free_extent_buffer(root->fs_info->chunk_root->commit_root); 1017 switch_commit_root(root->fs_info->chunk_root);
1019 root->fs_info->chunk_root->commit_root =
1020 btrfs_root_node(root->fs_info->chunk_root);
1021 1018
1022 update_super_roots(root); 1019 update_super_roots(root);
1023 1020
@@ -1057,6 +1054,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1057 cur_trans->commit_done = 1; 1054 cur_trans->commit_done = 1;
1058 1055
1059 root->fs_info->last_trans_committed = cur_trans->transid; 1056 root->fs_info->last_trans_committed = cur_trans->transid;
1057
1060 wake_up(&cur_trans->commit_wait); 1058 wake_up(&cur_trans->commit_wait);
1061 1059
1062 put_transaction(cur_trans); 1060 put_transaction(cur_trans);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c13922206d1b..d91b0de7c502 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -797,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
797 return -ENOENT; 797 return -ENOENT;
798 798
799 inode = read_one_inode(root, key->objectid); 799 inode = read_one_inode(root, key->objectid);
800 BUG_ON(!dir); 800 BUG_ON(!inode);
801 801
802 ref_ptr = btrfs_item_ptr_offset(eb, slot); 802 ref_ptr = btrfs_item_ptr_offset(eb, slot);
803 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 803 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3ab80e9cd767..5dbefd11b4af 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -721,7 +721,8 @@ error:
721 */ 721 */
722static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, 722static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
723 struct btrfs_device *device, 723 struct btrfs_device *device,
724 u64 num_bytes, u64 *start) 724 u64 num_bytes, u64 *start,
725 u64 *max_avail)
725{ 726{
726 struct btrfs_key key; 727 struct btrfs_key key;
727 struct btrfs_root *root = device->dev_root; 728 struct btrfs_root *root = device->dev_root;
@@ -758,9 +759,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
758 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 759 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
759 if (ret < 0) 760 if (ret < 0)
760 goto error; 761 goto error;
761 ret = btrfs_previous_item(root, path, 0, key.type); 762 if (ret > 0) {
762 if (ret < 0) 763 ret = btrfs_previous_item(root, path, key.objectid, key.type);
763 goto error; 764 if (ret < 0)
765 goto error;
766 if (ret > 0)
767 start_found = 1;
768 }
764 l = path->nodes[0]; 769 l = path->nodes[0];
765 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 770 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
766 while (1) { 771 while (1) {
@@ -803,6 +808,10 @@ no_more_items:
803 if (last_byte < search_start) 808 if (last_byte < search_start)
804 last_byte = search_start; 809 last_byte = search_start;
805 hole_size = key.offset - last_byte; 810 hole_size = key.offset - last_byte;
811
812 if (hole_size > *max_avail)
813 *max_avail = hole_size;
814
806 if (key.offset > last_byte && 815 if (key.offset > last_byte &&
807 hole_size >= num_bytes) { 816 hole_size >= num_bytes) {
808 *start = last_byte; 817 *start = last_byte;
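find_free_dev_extent() now also reports, via the new *max_avail argument, the largest hole it saw even when no hole was big enough, so the chunk allocator can retry with a smaller stripe. A compact model of that scan over an invented table of allocated device extents:

#include <stdio.h>
#include <stdint.h>

struct ext { uint64_t start, len; };	/* allocated device extents, sorted */

static int find_hole(const struct ext *e, int n, uint64_t dev_size,
		     uint64_t need, uint64_t *start, uint64_t *max_avail)
{
	uint64_t last_end = 0;
	int i;

	*max_avail = 0;
	for (i = 0; i <= n; i++) {
		uint64_t hole_end = (i < n) ? e[i].start : dev_size;
		uint64_t hole = hole_end > last_end ? hole_end - last_end : 0;

		if (hole > *max_avail)
			*max_avail = hole;	/* remember the biggest hole seen */
		if (hole >= need) {
			*start = last_end;
			return 0;
		}
		if (i < n)
			last_end = e[i].start + e[i].len;
	}
	return -1;	/* nothing big enough, but *max_avail still tells the caller how close it got */
}

int main(void)
{
	struct ext used[] = { { 0, 4096 }, { 8192, 4096 } };
	uint64_t start = 0, max_avail = 0;
	int ret = find_hole(used, 2, 1 << 20, 64 * 1024, &start, &max_avail);

	printf("ret %d start %llu max_avail %llu\n", ret,
	       (unsigned long long)start, (unsigned long long)max_avail);
	return 0;
}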
@@ -1621,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1621 device->fs_devices->total_rw_bytes += diff; 1630 device->fs_devices->total_rw_bytes += diff;
1622 1631
1623 device->total_bytes = new_size; 1632 device->total_bytes = new_size;
1633 device->disk_total_bytes = new_size;
1624 btrfs_clear_space_info_full(device->dev_root->fs_info); 1634 btrfs_clear_space_info_full(device->dev_root->fs_info);
1625 1635
1626 return btrfs_update_device(trans, device); 1636 return btrfs_update_device(trans, device);
@@ -2007,7 +2017,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2007 goto done; 2017 goto done;
2008 if (ret) { 2018 if (ret) {
2009 ret = 0; 2019 ret = 0;
2010 goto done; 2020 break;
2011 } 2021 }
2012 2022
2013 l = path->nodes[0]; 2023 l = path->nodes[0];
@@ -2015,7 +2025,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2015 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2025 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2016 2026
2017 if (key.objectid != device->devid) 2027 if (key.objectid != device->devid)
2018 goto done; 2028 break;
2019 2029
2020 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 2030 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2021 length = btrfs_dev_extent_length(l, dev_extent); 2031 length = btrfs_dev_extent_length(l, dev_extent);
@@ -2171,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2171 max_chunk_size); 2181 max_chunk_size);
2172 2182
2173again: 2183again:
2184 max_avail = 0;
2174 if (!map || map->num_stripes != num_stripes) { 2185 if (!map || map->num_stripes != num_stripes) {
2175 kfree(map); 2186 kfree(map);
2176 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2187 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -2219,7 +2230,8 @@ again:
2219 2230
2220 if (device->in_fs_metadata && avail >= min_free) { 2231 if (device->in_fs_metadata && avail >= min_free) {
2221 ret = find_free_dev_extent(trans, device, 2232 ret = find_free_dev_extent(trans, device,
2222 min_free, &dev_offset); 2233 min_free, &dev_offset,
2234 &max_avail);
2223 if (ret == 0) { 2235 if (ret == 0) {
2224 list_move_tail(&device->dev_alloc_list, 2236 list_move_tail(&device->dev_alloc_list,
2225 &private_devs); 2237 &private_devs);
@@ -2795,26 +2807,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2795 } 2807 }
2796 } 2808 }
2797 2809
2798 for (i = 0; i > nr; i++) {
2799 struct btrfs_multi_bio *multi;
2800 struct btrfs_bio_stripe *stripe;
2801 int ret;
2802
2803 length = 1;
2804 ret = btrfs_map_block(map_tree, WRITE, buf[i],
2805 &length, &multi, 0);
2806 BUG_ON(ret);
2807
2808 stripe = multi->stripes;
2809 for (j = 0; j < multi->num_stripes; j++) {
2810 if (stripe->physical >= physical &&
2811 physical < stripe->physical + length)
2812 break;
2813 }
2814 BUG_ON(j >= multi->num_stripes);
2815 kfree(multi);
2816 }
2817
2818 *logical = buf; 2810 *logical = buf;
2819 *naddrs = nr; 2811 *naddrs = nr;
2820 *stripe_len = map->stripe_len; 2812 *stripe_len = map->stripe_len;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9bb5c8750736..fc44d316d0bb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2452,10 +2452,10 @@ try_mount_again:
2452 tcon->local_lease = volume_info->local_lease; 2452 tcon->local_lease = volume_info->local_lease;
2453 } 2453 }
2454 if (pSesInfo) { 2454 if (pSesInfo) {
2455 if (pSesInfo->capabilities & CAP_LARGE_FILES) { 2455 if (pSesInfo->capabilities & CAP_LARGE_FILES)
2456 sb->s_maxbytes = (u64) 1 << 63; 2456 sb->s_maxbytes = MAX_LFS_FILESIZE;
2457 } else 2457 else
2458 sb->s_maxbytes = (u64) 1 << 31; /* 2 GB */ 2458 sb->s_maxbytes = MAX_NON_LFS;
2459 } 2459 }
2460 2460
2461 /* BB FIXME fix time_gran to be larger for LANMAN sessions */ 2461 /* BB FIXME fix time_gran to be larger for LANMAN sessions */
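[Editorial note] Replacing the open-coded shifts with the named limits MAX_LFS_FILESIZE and MAX_NON_LFS documents intent rather than magic numbers. A small standalone sketch of the same pattern; the constant values below are illustrative stand-ins, not the kernel's definitions.

    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative stand-ins for MAX_LFS_FILESIZE / MAX_NON_LFS. */
    #define SKETCH_MAX_LFS_FILESIZE  ((uint64_t)0x7fffffffffffffffULL)
    #define SKETCH_MAX_NON_LFS       ((uint64_t)0x7fffffffULL)   /* ~2 GB */

    struct sketch_super { uint64_t s_maxbytes; };

    static void set_maxbytes(struct sketch_super *sb, int large_files_capable)
    {
        /* A named limit reads better than (u64)1 << 63 style shifts. */
        sb->s_maxbytes = large_files_capable ? SKETCH_MAX_LFS_FILESIZE
                                             : SKETCH_MAX_NON_LFS;
    }

    int main(void)
    {
        struct sketch_super sb;
        set_maxbytes(&sb, 1);
        printf("max file size: %llu\n", (unsigned long long)sb.s_maxbytes);
        return 0;
    }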
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 18afe57b2461..82d83839655e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -212,7 +212,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
212 * junction to the new submount (ie to setup the fake directory 212 * junction to the new submount (ie to setup the fake directory
213 * which represents a DFS referral). 213 * which represents a DFS referral).
214 */ 214 */
215void 215static void
216cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) 216cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
217{ 217{
218 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 218 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -388,7 +388,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
388} 388}
389 389
390/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ 390/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
391void 391static void
392cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, 392cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
393 struct cifs_sb_info *cifs_sb, bool adjust_tz) 393 struct cifs_sb_info *cifs_sb, bool adjust_tz)
394{ 394{
@@ -513,9 +513,12 @@ int cifs_get_inode_info(struct inode **pinode,
513 cifs_sb->mnt_cifs_flags & 513 cifs_sb->mnt_cifs_flags &
514 CIFS_MOUNT_MAP_SPECIAL_CHR); 514 CIFS_MOUNT_MAP_SPECIAL_CHR);
515 if (rc1) { 515 if (rc1) {
516 /* BB EOPNOSUPP disable SERVER_INUM? */
517 cFYI(1, ("GetSrvInodeNum rc %d", rc1)); 516 cFYI(1, ("GetSrvInodeNum rc %d", rc1));
518 fattr.cf_uniqueid = iunique(sb, ROOT_I); 517 fattr.cf_uniqueid = iunique(sb, ROOT_I);
518 /* disable serverino if call not supported */
519 if (rc1 == -EINVAL)
520 cifs_sb->mnt_cifs_flags &=
521 ~CIFS_MOUNT_SERVER_INUM;
519 } 522 }
520 } else { 523 } else {
521 fattr.cf_uniqueid = iunique(sb, ROOT_I); 524 fattr.cf_uniqueid = iunique(sb, ROOT_I);
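[Editorial note] When the server inode-number query fails with -EINVAL, the mount now stops asking and keeps using locally generated numbers. A toy version of that fallback; the flag name and helper functions are invented, only the control flow mirrors the hunk above.

    #include <stdio.h>
    #include <errno.h>

    #define MNT_SERVER_INUM (1u << 0)      /* toy stand-in for the mount flag */

    static unsigned long next_local;       /* toy stand-in for iunique() */
    static unsigned long local_inum(void) { return ++next_local; }

    /* Pretend server call; -EINVAL models "call not supported". */
    static int query_server_inum(unsigned long *num) { (void)num; return -EINVAL; }

    static unsigned long get_inum(unsigned int *mnt_flags)
    {
        unsigned long num;
        if (*mnt_flags & MNT_SERVER_INUM) {
            int rc = query_server_inum(&num);
            if (rc == 0)
                return num;
            if (rc == -EINVAL)             /* server can't do it: stop asking */
                *mnt_flags &= ~MNT_SERVER_INUM;
        }
        return local_inum();               /* fall back to a local number */
    }

    int main(void)
    {
        unsigned int flags = MNT_SERVER_INUM;
        printf("inum=%lu flags=0x%x\n", get_inum(&flags), flags);
        printf("inum=%lu flags=0x%x\n", get_inum(&flags), flags);
        return 0;
    }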
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index af737bb56cb7..259525c9abb8 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1303,6 +1303,13 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
1303 } 1303 }
1304 (*new_auth_tok)->session_key.encrypted_key_size = 1304 (*new_auth_tok)->session_key.encrypted_key_size =
1305 (body_size - (ECRYPTFS_SALT_SIZE + 5)); 1305 (body_size - (ECRYPTFS_SALT_SIZE + 5));
1306 if ((*new_auth_tok)->session_key.encrypted_key_size
1307 > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
1308 printk(KERN_WARNING "Tag 3 packet contains key larger "
1309 "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n");
1310 rc = -EINVAL;
1311 goto out_free;
1312 }
1306 if (unlikely(data[(*packet_size)++] != 0x04)) { 1313 if (unlikely(data[(*packet_size)++] != 0x04)) {
1307 printk(KERN_WARNING "Unknown version number [%d]\n", 1314 printk(KERN_WARNING "Unknown version number [%d]\n",
1308 data[(*packet_size) - 1]); 1315 data[(*packet_size) - 1]);
@@ -1449,6 +1456,12 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents,
1449 rc = -EINVAL; 1456 rc = -EINVAL;
1450 goto out; 1457 goto out;
1451 } 1458 }
1459 if (unlikely((*tag_11_contents_size) > max_contents_bytes)) {
1460 printk(KERN_ERR "Literal data section in tag 11 packet exceeds "
1461 "expected size\n");
1462 rc = -EINVAL;
1463 goto out;
1464 }
1452 if (data[(*packet_size)++] != 0x62) { 1465 if (data[(*packet_size)++] != 0x62) {
1453 printk(KERN_WARNING "Unrecognizable packet\n"); 1466 printk(KERN_WARNING "Unrecognizable packet\n");
1454 rc = -EINVAL; 1467 rc = -EINVAL;
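[Editorial note] Both new checks refuse to honour a length field that is larger than the buffer it will later be copied into. A self-contained sketch of that kind of validation; the packet layout and KEY_BUF_MAX are invented for illustration, not the eCryptfs format.

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>

    #define KEY_BUF_MAX 64   /* stands in for ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES */

    /* Copy a length-prefixed field out of a packet, refusing oversized claims. */
    static int parse_key(const unsigned char *pkt, size_t pkt_len,
                         unsigned char *out, size_t *out_len)
    {
        if (pkt_len < 1)
            return -EINVAL;
        size_t claimed = pkt[0];           /* length byte supplied by the peer */
        if (claimed > KEY_BUF_MAX || claimed > pkt_len - 1)
            return -EINVAL;                /* never trust it beyond our buffer */
        memcpy(out, pkt + 1, claimed);
        *out_len = claimed;
        return 0;
    }

    int main(void)
    {
        unsigned char bad[2] = { 0xff, 0x00 };  /* claims 255 bytes, has 1 */
        unsigned char buf[KEY_BUF_MAX];
        size_t n;
        printf("oversized packet -> %d\n", parse_key(bad, sizeof(bad), buf, &n));
        return 0;
    }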
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 3d724a95882f..373fa90c796a 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp,
130 struct buffer_head *bh = NULL; 130 struct buffer_head *bh = NULL;
131 131
132 map_bh.b_state = 0; 132 map_bh.b_state = 0;
133 err = ext3_get_blocks_handle(NULL, inode, blk, 1, 133 err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0);
134 &map_bh, 0, 0);
135 if (err > 0) { 134 if (err > 0) {
136 pgoff_t index = map_bh.b_blocknr >> 135 pgoff_t index = map_bh.b_blocknr >>
137 (PAGE_CACHE_SHIFT - inode->i_blkbits); 136 (PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5f51fed5c750..b49908a167ae 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -788,7 +788,7 @@ err_out:
788int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, 788int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
789 sector_t iblock, unsigned long maxblocks, 789 sector_t iblock, unsigned long maxblocks,
790 struct buffer_head *bh_result, 790 struct buffer_head *bh_result,
791 int create, int extend_disksize) 791 int create)
792{ 792{
793 int err = -EIO; 793 int err = -EIO;
794 int offsets[4]; 794 int offsets[4];
@@ -911,13 +911,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
911 if (!err) 911 if (!err)
912 err = ext3_splice_branch(handle, inode, iblock, 912 err = ext3_splice_branch(handle, inode, iblock,
913 partial, indirect_blks, count); 913 partial, indirect_blks, count);
914 /*
915 * i_disksize growing is protected by truncate_mutex. Don't forget to
916 * protect it if you're about to implement concurrent
917 * ext3_get_block() -bzzz
918 */
919 if (!err && extend_disksize && inode->i_size > ei->i_disksize)
920 ei->i_disksize = inode->i_size;
921 mutex_unlock(&ei->truncate_mutex); 914 mutex_unlock(&ei->truncate_mutex);
922 if (err) 915 if (err)
923 goto cleanup; 916 goto cleanup;
@@ -972,7 +965,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock,
972 } 965 }
973 966
974 ret = ext3_get_blocks_handle(handle, inode, iblock, 967 ret = ext3_get_blocks_handle(handle, inode, iblock,
975 max_blocks, bh_result, create, 0); 968 max_blocks, bh_result, create);
976 if (ret > 0) { 969 if (ret > 0) {
977 bh_result->b_size = (ret << inode->i_blkbits); 970 bh_result->b_size = (ret << inode->i_blkbits);
978 ret = 0; 971 ret = 0;
@@ -1005,7 +998,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
1005 dummy.b_blocknr = -1000; 998 dummy.b_blocknr = -1000;
1006 buffer_trace_init(&dummy.b_history); 999 buffer_trace_init(&dummy.b_history);
1007 err = ext3_get_blocks_handle(handle, inode, block, 1, 1000 err = ext3_get_blocks_handle(handle, inode, block, 1,
1008 &dummy, create, 1); 1001 &dummy, create);
1009 /* 1002 /*
1010 * ext3_get_blocks_handle() returns number of blocks 1003 * ext3_get_blocks_handle() returns number of blocks
1011 * mapped. 0 in case of a HOLE. 1004 * mapped. 0 in case of a HOLE.
@@ -1193,15 +1186,16 @@ write_begin_failed:
1193 * i_size_read because we hold i_mutex. 1186 * i_size_read because we hold i_mutex.
1194 * 1187 *
1195 * Add inode to orphan list in case we crash before truncate 1188 * Add inode to orphan list in case we crash before truncate
1196 * finishes. 1189 * finishes. Do this only if ext3_can_truncate() agrees so
1190 * that orphan processing code is happy.
1197 */ 1191 */
1198 if (pos + len > inode->i_size) 1192 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1199 ext3_orphan_add(handle, inode); 1193 ext3_orphan_add(handle, inode);
1200 ext3_journal_stop(handle); 1194 ext3_journal_stop(handle);
1201 unlock_page(page); 1195 unlock_page(page);
1202 page_cache_release(page); 1196 page_cache_release(page);
1203 if (pos + len > inode->i_size) 1197 if (pos + len > inode->i_size)
1204 vmtruncate(inode, inode->i_size); 1198 ext3_truncate(inode);
1205 } 1199 }
1206 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) 1200 if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1207 goto retry; 1201 goto retry;
@@ -1287,7 +1281,7 @@ static int ext3_ordered_write_end(struct file *file,
1287 * There may be allocated blocks outside of i_size because 1281 * There may be allocated blocks outside of i_size because
1288 * we failed to copy some data. Prepare for truncate. 1282 * we failed to copy some data. Prepare for truncate.
1289 */ 1283 */
1290 if (pos + len > inode->i_size) 1284 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1291 ext3_orphan_add(handle, inode); 1285 ext3_orphan_add(handle, inode);
1292 ret2 = ext3_journal_stop(handle); 1286 ret2 = ext3_journal_stop(handle);
1293 if (!ret) 1287 if (!ret)
@@ -1296,7 +1290,7 @@ static int ext3_ordered_write_end(struct file *file,
1296 page_cache_release(page); 1290 page_cache_release(page);
1297 1291
1298 if (pos + len > inode->i_size) 1292 if (pos + len > inode->i_size)
1299 vmtruncate(inode, inode->i_size); 1293 ext3_truncate(inode);
1300 return ret ? ret : copied; 1294 return ret ? ret : copied;
1301} 1295}
1302 1296
@@ -1315,14 +1309,14 @@ static int ext3_writeback_write_end(struct file *file,
1315 * There may be allocated blocks outside of i_size because 1309 * There may be allocated blocks outside of i_size because
1316 * we failed to copy some data. Prepare for truncate. 1310 * we failed to copy some data. Prepare for truncate.
1317 */ 1311 */
1318 if (pos + len > inode->i_size) 1312 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1319 ext3_orphan_add(handle, inode); 1313 ext3_orphan_add(handle, inode);
1320 ret = ext3_journal_stop(handle); 1314 ret = ext3_journal_stop(handle);
1321 unlock_page(page); 1315 unlock_page(page);
1322 page_cache_release(page); 1316 page_cache_release(page);
1323 1317
1324 if (pos + len > inode->i_size) 1318 if (pos + len > inode->i_size)
1325 vmtruncate(inode, inode->i_size); 1319 ext3_truncate(inode);
1326 return ret ? ret : copied; 1320 return ret ? ret : copied;
1327} 1321}
1328 1322
@@ -1358,7 +1352,7 @@ static int ext3_journalled_write_end(struct file *file,
1358 * There may be allocated blocks outside of i_size because 1352 * There may be allocated blocks outside of i_size because
1359 * we failed to copy some data. Prepare for truncate. 1353 * we failed to copy some data. Prepare for truncate.
1360 */ 1354 */
1361 if (pos + len > inode->i_size) 1355 if (pos + len > inode->i_size && ext3_can_truncate(inode))
1362 ext3_orphan_add(handle, inode); 1356 ext3_orphan_add(handle, inode);
1363 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; 1357 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1364 if (inode->i_size > EXT3_I(inode)->i_disksize) { 1358 if (inode->i_size > EXT3_I(inode)->i_disksize) {
@@ -1375,7 +1369,7 @@ static int ext3_journalled_write_end(struct file *file,
1375 page_cache_release(page); 1369 page_cache_release(page);
1376 1370
1377 if (pos + len > inode->i_size) 1371 if (pos + len > inode->i_size)
1378 vmtruncate(inode, inode->i_size); 1372 ext3_truncate(inode);
1379 return ret ? ret : copied; 1373 return ret ? ret : copied;
1380} 1374}
1381 1375
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 737f7246a4b5..f96f85092d1c 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -287,6 +287,7 @@ int journal_write_metadata_buffer(transaction_t *transaction,
287 struct page *new_page; 287 struct page *new_page;
288 unsigned int new_offset; 288 unsigned int new_offset;
289 struct buffer_head *bh_in = jh2bh(jh_in); 289 struct buffer_head *bh_in = jh2bh(jh_in);
290 journal_t *journal = transaction->t_journal;
290 291
291 /* 292 /*
292 * The buffer really shouldn't be locked: only the current committing 293 * The buffer really shouldn't be locked: only the current committing
@@ -300,6 +301,11 @@ int journal_write_metadata_buffer(transaction_t *transaction,
300 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); 301 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
301 302
302 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); 303 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
304 /* keep subsequent assertions sane */
305 new_bh->b_state = 0;
306 init_buffer(new_bh, NULL, NULL);
307 atomic_set(&new_bh->b_count, 1);
308 new_jh = journal_add_journal_head(new_bh); /* This sleeps */
303 309
304 /* 310 /*
305 * If a new transaction has already done a buffer copy-out, then 311 * If a new transaction has already done a buffer copy-out, then
@@ -361,14 +367,6 @@ repeat:
361 kunmap_atomic(mapped_data, KM_USER0); 367 kunmap_atomic(mapped_data, KM_USER0);
362 } 368 }
363 369
364 /* keep subsequent assertions sane */
365 new_bh->b_state = 0;
366 init_buffer(new_bh, NULL, NULL);
367 atomic_set(&new_bh->b_count, 1);
368 jbd_unlock_bh_state(bh_in);
369
370 new_jh = journal_add_journal_head(new_bh); /* This sleeps */
371
372 set_bh_page(new_bh, new_page, new_offset); 370 set_bh_page(new_bh, new_page, new_offset);
373 new_jh->b_transaction = NULL; 371 new_jh->b_transaction = NULL;
374 new_bh->b_size = jh2bh(jh_in)->b_size; 372 new_bh->b_size = jh2bh(jh_in)->b_size;
@@ -385,7 +383,11 @@ repeat:
385 * copying is moved to the transaction's shadow queue. 383 * copying is moved to the transaction's shadow queue.
386 */ 384 */
387 JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); 385 JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
388 journal_file_buffer(jh_in, transaction, BJ_Shadow); 386 spin_lock(&journal->j_list_lock);
387 __journal_file_buffer(jh_in, transaction, BJ_Shadow);
388 spin_unlock(&journal->j_list_lock);
389 jbd_unlock_bh_state(bh_in);
390
389 JBUFFER_TRACE(new_jh, "file as BJ_IO"); 391 JBUFFER_TRACE(new_jh, "file as BJ_IO");
390 journal_file_buffer(new_jh, transaction, BJ_IO); 392 journal_file_buffer(new_jh, transaction, BJ_IO);
391 393
@@ -848,6 +850,12 @@ static int journal_reset(journal_t *journal)
848 850
849 first = be32_to_cpu(sb->s_first); 851 first = be32_to_cpu(sb->s_first);
850 last = be32_to_cpu(sb->s_maxlen); 852 last = be32_to_cpu(sb->s_maxlen);
853 if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
854 printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n",
855 first, last);
856 journal_fail_superblock(journal);
857 return -EINVAL;
858 }
851 859
852 journal->j_first = first; 860 journal->j_first = first;
853 journal->j_last = last; 861 journal->j_last = last;
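[Editorial note] The new journal_reset() check rejects a journal with fewer usable blocks than the minimum: `first + JFS_MIN_JOURNAL_BLOCKS > last + 1` is a rearrangement of `last - first + 1 < JFS_MIN_JOURNAL_BLOCKS`. A standalone sketch of the same inequality; the 1024 minimum below is an illustrative stand-in for the kernel constant.

    #include <stdio.h>

    #define MIN_JOURNAL_BLOCKS 1024UL   /* illustrative stand-in */

    /* Blocks first..last inclusive are usable; reject journals that are too
     * short.  first + MIN > last + 1  <=>  (last - first + 1) < MIN. */
    static int journal_long_enough(unsigned long first, unsigned long last)
    {
        return !(first + MIN_JOURNAL_BLOCKS > last + 1);
    }

    int main(void)
    {
        printf("first=1 last=1023 -> %d\n", journal_long_enough(1, 1023)); /* 0 */
        printf("first=1 last=1024 -> %d\n", journal_long_enough(1, 1024)); /* 1 */
        return 0;
    }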
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 73242ba7c7b1..c03ac11f74be 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -489,34 +489,15 @@ void journal_unlock_updates (journal_t *journal)
489 wake_up(&journal->j_wait_transaction_locked); 489 wake_up(&journal->j_wait_transaction_locked);
490} 490}
491 491
492/* 492static void warn_dirty_buffer(struct buffer_head *bh)
493 * Report any unexpected dirty buffers which turn up. Normally those
494 * indicate an error, but they can occur if the user is running (say)
495 * tune2fs to modify the live filesystem, so we need the option of
496 * continuing as gracefully as possible. #
497 *
498 * The caller should already hold the journal lock and
499 * j_list_lock spinlock: most callers will need those anyway
500 * in order to probe the buffer's journaling state safely.
501 */
502static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
503{ 493{
504 int jlist; 494 char b[BDEVNAME_SIZE];
505
506 /* If this buffer is one which might reasonably be dirty
507 * --- ie. data, or not part of this journal --- then
508 * we're OK to leave it alone, but otherwise we need to
509 * move the dirty bit to the journal's own internal
510 * JBDDirty bit. */
511 jlist = jh->b_jlist;
512 495
513 if (jlist == BJ_Metadata || jlist == BJ_Reserved || 496 printk(KERN_WARNING
514 jlist == BJ_Shadow || jlist == BJ_Forget) { 497 "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
515 struct buffer_head *bh = jh2bh(jh); 498 "There's a risk of filesystem corruption in case of system "
516 499 "crash.\n",
517 if (test_clear_buffer_dirty(bh)) 500 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
518 set_buffer_jbddirty(bh);
519 }
520} 501}
521 502
522/* 503/*
@@ -583,14 +564,16 @@ repeat:
583 if (jh->b_next_transaction) 564 if (jh->b_next_transaction)
584 J_ASSERT_JH(jh, jh->b_next_transaction == 565 J_ASSERT_JH(jh, jh->b_next_transaction ==
585 transaction); 566 transaction);
567 warn_dirty_buffer(bh);
586 } 568 }
587 /* 569 /*
588 * In any case we need to clean the dirty flag and we must 570 * In any case we need to clean the dirty flag and we must
589 * do it under the buffer lock to be sure we don't race 571 * do it under the buffer lock to be sure we don't race
590 * with running write-out. 572 * with running write-out.
591 */ 573 */
592 JBUFFER_TRACE(jh, "Unexpected dirty buffer"); 574 JBUFFER_TRACE(jh, "Journalling dirty buffer");
593 jbd_unexpected_dirty_buffer(jh); 575 clear_buffer_dirty(bh);
576 set_buffer_jbddirty(bh);
594 } 577 }
595 578
596 unlock_buffer(bh); 579 unlock_buffer(bh);
@@ -826,6 +809,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
826 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); 809 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
827 810
828 if (jh->b_transaction == NULL) { 811 if (jh->b_transaction == NULL) {
812 /*
813 * Previous journal_forget() could have left the buffer
814 * with jbddirty bit set because it was being committed. When
815 * the commit finished, we've filed the buffer for
816 * checkpointing and marked it dirty. Now we are reallocating
817 * the buffer so the transaction freeing it must have
818 * committed and so it's safe to clear the dirty bit.
819 */
820 clear_buffer_dirty(jh2bh(jh));
829 jh->b_transaction = transaction; 821 jh->b_transaction = transaction;
830 822
831 /* first access by this transaction */ 823 /* first access by this transaction */
@@ -1782,8 +1774,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1782 1774
1783 if (jh->b_cp_transaction) { 1775 if (jh->b_cp_transaction) {
1784 JBUFFER_TRACE(jh, "on running+cp transaction"); 1776 JBUFFER_TRACE(jh, "on running+cp transaction");
1777 /*
1778 * We don't want to write the buffer anymore, clear the
1779 * bit so that we don't confuse checks in
1780 * __journal_file_buffer
1781 */
1782 clear_buffer_dirty(bh);
1785 __journal_file_buffer(jh, transaction, BJ_Forget); 1783 __journal_file_buffer(jh, transaction, BJ_Forget);
1786 clear_buffer_jbddirty(bh);
1787 may_free = 0; 1784 may_free = 0;
1788 } else { 1785 } else {
1789 JBUFFER_TRACE(jh, "on running transaction"); 1786 JBUFFER_TRACE(jh, "on running transaction");
@@ -2041,12 +2038,17 @@ void __journal_file_buffer(struct journal_head *jh,
2041 if (jh->b_transaction && jh->b_jlist == jlist) 2038 if (jh->b_transaction && jh->b_jlist == jlist)
2042 return; 2039 return;
2043 2040
2044 /* The following list of buffer states needs to be consistent
2045 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
2046 * state. */
2047
2048 if (jlist == BJ_Metadata || jlist == BJ_Reserved || 2041 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
2049 jlist == BJ_Shadow || jlist == BJ_Forget) { 2042 jlist == BJ_Shadow || jlist == BJ_Forget) {
2043 /*
2044 * For metadata buffers, we track dirty bit in buffer_jbddirty
2045 * instead of buffer_dirty. We should not see a dirty bit set
2046 * here because we clear it in do_get_write_access but e.g.
2047 * tune2fs can modify the sb and set the dirty bit at any time
2048 * so we try to gracefully handle that.
2049 */
2050 if (buffer_dirty(bh))
2051 warn_dirty_buffer(bh);
2050 if (test_clear_buffer_dirty(bh) || 2052 if (test_clear_buffer_dirty(bh) ||
2051 test_clear_buffer_jbddirty(bh)) 2053 test_clear_buffer_jbddirty(bh))
2052 was_dirty = 1; 2054 was_dirty = 1;
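[Editorial note] The warn_dirty_buffer()/__journal_file_buffer() changes concentrate one rule in one place: journaled metadata carries its dirtiness in the journal-private jbddirty bit, and an unexpected plain dirty bit only earns a warning before being transferred. A toy model of that flag transfer; the flag bits and struct are invented, not the kernel buffer_head API.

    #include <stdio.h>

    #define BH_DIRTY     (1u << 0)   /* generic "needs writeback" bit */
    #define BH_JBDDIRTY  (1u << 1)   /* journal-private dirty bit */

    struct sketch_bh { unsigned flags; unsigned long long blocknr; };

    static void warn_dirty(struct sketch_bh *bh)
    {
        fprintf(stderr, "sketch: dirty metadata buffer at block %llu, "
                "risk of corruption after a crash\n", bh->blocknr);
    }

    /* File a metadata buffer on a journal list: the generic dirty bit must not
     * stay set, so warn about it and carry the state in the journal's own bit. */
    static void file_metadata_buffer(struct sketch_bh *bh)
    {
        if (bh->flags & BH_DIRTY)
            warn_dirty(bh);
        if (bh->flags & (BH_DIRTY | BH_JBDDIRTY)) {
            bh->flags &= ~BH_DIRTY;
            bh->flags |= BH_JBDDIRTY;
        }
    }

    int main(void)
    {
        struct sketch_bh bh = { .flags = BH_DIRTY, .blocknr = 42 };
        file_metadata_buffer(&bh);
        printf("flags after filing: 0x%x\n", bh.flags);  /* 0x2: jbddirty only */
        return 0;
    }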
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 91fa3ad6e8c2..a29c7c3e3fb8 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -67,10 +67,8 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
67 acl = posix_acl_from_xattr(value, size); 67 acl = posix_acl_from_xattr(value, size);
68 } 68 }
69 kfree(value); 69 kfree(value);
70 if (!IS_ERR(acl)) { 70 if (!IS_ERR(acl))
71 set_cached_acl(inode, type, acl); 71 set_cached_acl(inode, type, acl);
72 posix_acl_release(acl);
73 }
74 return acl; 72 return acl;
75} 73}
76 74
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index c2d061675d80..8d25ccb2d51d 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1242,20 +1242,6 @@ error:
1242 return error; 1242 return error;
1243} 1243}
1244 1244
1245/*
1246 * Initialize a session.
1247 * Note: save the mount rsize and wsize for create_server negotiation.
1248 */
1249static void nfs4_init_session(struct nfs_client *clp,
1250 unsigned int wsize, unsigned int rsize)
1251{
1252#if defined(CONFIG_NFS_V4_1)
1253 if (nfs4_has_session(clp)) {
1254 clp->cl_session->fc_attrs.max_rqst_sz = wsize;
1255 clp->cl_session->fc_attrs.max_resp_sz = rsize;
1256 }
1257#endif /* CONFIG_NFS_V4_1 */
1258}
1259 1245
1260/* 1246/*
1261 * Session has been established, and the client marked ready. 1247 * Session has been established, and the client marked ready.
@@ -1350,7 +1336,9 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1350 BUG_ON(!server->nfs_client->rpc_ops); 1336 BUG_ON(!server->nfs_client->rpc_ops);
1351 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1337 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1352 1338
1353 nfs4_init_session(server->nfs_client, server->wsize, server->rsize); 1339 error = nfs4_init_session(server);
1340 if (error < 0)
1341 goto error;
1354 1342
1355 /* Probe the root fh to retrieve its FSID */ 1343 /* Probe the root fh to retrieve its FSID */
1356 error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); 1344 error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 38d42c29fb92..32062c33c859 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1025,12 +1025,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1025 res = NULL; 1025 res = NULL;
1026 goto out; 1026 goto out;
1027 /* This turned out not to be a regular file */ 1027 /* This turned out not to be a regular file */
1028 case -EISDIR:
1029 case -ENOTDIR: 1028 case -ENOTDIR:
1030 goto no_open; 1029 goto no_open;
1031 case -ELOOP: 1030 case -ELOOP:
1032 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1031 if (!(nd->intent.open.flags & O_NOFOLLOW))
1033 goto no_open; 1032 goto no_open;
1033 /* case -EISDIR: */
1034 /* case -EINVAL: */ 1034 /* case -EINVAL: */
1035 default: 1035 default:
1036 goto out; 1036 goto out;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 61bc3a32e1e2..6ea07a3c75d4 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -220,6 +220,7 @@ extern void nfs4_destroy_session(struct nfs4_session *session);
220extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); 220extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
221extern int nfs4_proc_create_session(struct nfs_client *, int reset); 221extern int nfs4_proc_create_session(struct nfs_client *, int reset);
222extern int nfs4_proc_destroy_session(struct nfs4_session *); 222extern int nfs4_proc_destroy_session(struct nfs4_session *);
223extern int nfs4_init_session(struct nfs_server *server);
223#else /* CONFIG_NFS_v4_1 */ 224#else /* CONFIG_NFS_v4_1 */
224static inline int nfs4_setup_sequence(struct nfs_client *clp, 225static inline int nfs4_setup_sequence(struct nfs_client *clp,
225 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 226 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
@@ -227,6 +228,11 @@ static inline int nfs4_setup_sequence(struct nfs_client *clp,
227{ 228{
228 return 0; 229 return 0;
229} 230}
231
232static inline int nfs4_init_session(struct nfs_server *server)
233{
234 return 0;
235}
230#endif /* CONFIG_NFS_V4_1 */ 236#endif /* CONFIG_NFS_V4_1 */
231 237
232extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; 238extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[];
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ff0c080db59b..6917311f201c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2040,15 +2040,9 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
2040 .rpc_argp = &args, 2040 .rpc_argp = &args,
2041 .rpc_resp = &res, 2041 .rpc_resp = &res,
2042 }; 2042 };
2043 int status;
2044 2043
2045 nfs_fattr_init(info->fattr); 2044 nfs_fattr_init(info->fattr);
2046 status = nfs4_recover_expired_lease(server); 2045 return nfs4_call_sync(server, &msg, &args, &res, 0);
2047 if (!status)
2048 status = nfs4_check_client_ready(server->nfs_client);
2049 if (!status)
2050 status = nfs4_call_sync(server, &msg, &args, &res, 0);
2051 return status;
2052} 2046}
2053 2047
2054static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, 2048static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -4099,15 +4093,23 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
4099 if (request->fl_start < 0 || request->fl_end < 0) 4093 if (request->fl_start < 0 || request->fl_end < 0)
4100 return -EINVAL; 4094 return -EINVAL;
4101 4095
4102 if (IS_GETLK(cmd)) 4096 if (IS_GETLK(cmd)) {
4103 return nfs4_proc_getlk(state, F_GETLK, request); 4097 if (state != NULL)
4098 return nfs4_proc_getlk(state, F_GETLK, request);
4099 return 0;
4100 }
4104 4101
4105 if (!(IS_SETLK(cmd) || IS_SETLKW(cmd))) 4102 if (!(IS_SETLK(cmd) || IS_SETLKW(cmd)))
4106 return -EINVAL; 4103 return -EINVAL;
4107 4104
4108 if (request->fl_type == F_UNLCK) 4105 if (request->fl_type == F_UNLCK) {
4109 return nfs4_proc_unlck(state, cmd, request); 4106 if (state != NULL)
4107 return nfs4_proc_unlck(state, cmd, request);
4108 return 0;
4109 }
4110 4110
4111 if (state == NULL)
4112 return -ENOLCK;
4111 do { 4113 do {
4112 status = nfs4_proc_setlk(state, cmd, request); 4114 status = nfs4_proc_setlk(state, cmd, request);
4113 if ((status != -EAGAIN) || IS_SETLK(cmd)) 4115 if ((status != -EAGAIN) || IS_SETLK(cmd))
@@ -4793,6 +4795,22 @@ int nfs4_proc_destroy_session(struct nfs4_session *session)
4793 return status; 4795 return status;
4794} 4796}
4795 4797
4798int nfs4_init_session(struct nfs_server *server)
4799{
4800 struct nfs_client *clp = server->nfs_client;
4801 int ret;
4802
4803 if (!nfs4_has_session(clp))
4804 return 0;
4805
4806 clp->cl_session->fc_attrs.max_rqst_sz = server->wsize;
4807 clp->cl_session->fc_attrs.max_resp_sz = server->rsize;
4808 ret = nfs4_recover_expired_lease(server);
4809 if (!ret)
4810 ret = nfs4_check_client_ready(clp);
4811 return ret;
4812}
4813
4796/* 4814/*
4797 * Renew the cl_session lease. 4815 * Renew the cl_session lease.
4798 */ 4816 */
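[Editorial note] nfs4_proc_lock now tolerates a NULL open state: GETLK and F_UNLCK become no-ops and only a new lock request fails, with -ENOLCK. A standalone sketch of that dispatch shape; it is a simplification for illustration, not the NFS client code.

    #include <stdio.h>
    #include <errno.h>

    enum lk_cmd { GETLK, SETLK, UNLCK };

    /* Guard a lock request that arrives with no open state: queries and
     * unlocks succeed trivially, taking a new lock cannot. */
    static int do_lock(enum lk_cmd cmd, const void *state)
    {
        switch (cmd) {
        case GETLK:
        case UNLCK:
            if (!state)
                return 0;        /* nothing is held, nothing to report or undo */
            break;
        case SETLK:
            if (!state)
                return -ENOLCK;  /* cannot take a lock without open state */
            break;
        }
        return 0;                /* real work would happen here */
    }

    int main(void)
    {
        printf("GETLK, no state: %d\n", do_lock(GETLK, NULL));
        printf("SETLK, no state: %d\n", do_lock(SETLK, NULL));
        return 0;
    }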
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index b73c5a728655..65ca8c18476f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -553,6 +553,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
553 INIT_LIST_HEAD(&lsp->ls_sequence.list); 553 INIT_LIST_HEAD(&lsp->ls_sequence.list);
554 lsp->ls_seqid.sequence = &lsp->ls_sequence; 554 lsp->ls_seqid.sequence = &lsp->ls_sequence;
555 atomic_set(&lsp->ls_count, 1); 555 atomic_set(&lsp->ls_count, 1);
556 lsp->ls_state = state;
556 lsp->ls_owner = fl_owner; 557 lsp->ls_owner = fl_owner;
557 spin_lock(&clp->cl_lock); 558 spin_lock(&clp->cl_lock);
558 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); 559 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
@@ -587,7 +588,6 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
587 if (lsp != NULL) 588 if (lsp != NULL)
588 break; 589 break;
589 if (new != NULL) { 590 if (new != NULL) {
590 new->ls_state = state;
591 list_add(&new->ls_locks, &state->lock_states); 591 list_add(&new->ls_locks, &state->lock_states);
592 set_bit(LK_STATE_IN_USE, &state->flags); 592 set_bit(LK_STATE_IN_USE, &state->flags);
593 lsp = new; 593 lsp = new;
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
new file mode 100644
index 000000000000..72da095d4009
--- /dev/null
+++ b/fs/nilfs2/Kconfig
@@ -0,0 +1,25 @@
1config NILFS2_FS
2 tristate "NILFS2 file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL
4 select CRC32
5 help
6 NILFS2 is a log-structured file system (LFS) supporting continuous
7 snapshotting. In addition to versioning capability of the entire
8 file system, users can even restore files mistakenly overwritten or
9 destroyed just a few seconds ago. Since this file system can keep
10 consistency like conventional LFS, it achieves quick recovery after
11 system crashes.
12
13 NILFS2 creates a number of checkpoints every few seconds or per
14 synchronous write basis (unless there is no change). Users can
15 select significant versions among continuously created checkpoints,
16 and can change them into snapshots which will be preserved for long
17 periods until they are changed back to checkpoints. Each
18 snapshot is mountable as a read-only file system concurrently with
19 its writable mount, and this feature is convenient for online backup.
20
21 Some features including atime, extended attributes, and POSIX ACLs,
22 are not supported yet.
23
24 To compile this file system support as a module, choose M here: the
25 module will be called nilfs2. If unsure, say N.
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 31dac7e3b0f1..dffbb0911d02 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,15 +1,5 @@
1config FSNOTIFY 1config FSNOTIFY
2 bool "Filesystem notification backend" 2 def_bool n
3 default y
4 ---help---
5 fsnotify is a backend for filesystem notification. fsnotify does
6 not provide any userspace interface but does provide the basis
7 needed for other notification schemes such as dnotify, inotify,
8 and fanotify.
9
10 Say Y here to enable fsnotify suport.
11
12 If unsure, say Y.
13 3
14source "fs/notify/dnotify/Kconfig" 4source "fs/notify/dnotify/Kconfig"
15source "fs/notify/inotify/Kconfig" 5source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
index 904ff8d5405a..f9c1ca139d8f 100644
--- a/fs/notify/dnotify/Kconfig
+++ b/fs/notify/dnotify/Kconfig
@@ -1,6 +1,6 @@
1config DNOTIFY 1config DNOTIFY
2 bool "Dnotify support" 2 bool "Dnotify support"
3 depends on FSNOTIFY 3 select FSNOTIFY
4 default y 4 default y
5 help 5 help
6 Dnotify is a directory-based per-fd file change notification system 6 Dnotify is a directory-based per-fd file change notification system
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ec2f7bd76818..037e878e03fc 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -159,7 +159,9 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
159 if (!group->ops->should_send_event(group, to_tell, mask)) 159 if (!group->ops->should_send_event(group, to_tell, mask))
160 continue; 160 continue;
161 if (!event) { 161 if (!event) {
162 event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); 162 event = fsnotify_create_event(to_tell, mask, data,
163 data_is, file_name, cookie,
164 GFP_KERNEL);
163 /* shit, we OOM'd and now we can't tell, maybe 165 /* shit, we OOM'd and now we can't tell, maybe
164 * someday someone else will want to do something 166 * someday someone else will want to do something
165 * here */ 167 * here */
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 5356884289a1..3e56dbffe729 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -15,7 +15,7 @@ config INOTIFY
15 15
16config INOTIFY_USER 16config INOTIFY_USER
17 bool "Inotify support for userspace" 17 bool "Inotify support for userspace"
18 depends on FSNOTIFY 18 select FSNOTIFY
19 default y 19 default y
20 ---help--- 20 ---help---
21 Say Y here to enable inotify support for userspace, including the 21 Say Y here to enable inotify support for userspace, including the
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ff27a2965844..f30d9bbc2e1b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -57,7 +57,6 @@ int inotify_max_user_watches __read_mostly;
57 57
58static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; 58static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
59struct kmem_cache *event_priv_cachep __read_mostly; 59struct kmem_cache *event_priv_cachep __read_mostly;
60static struct fsnotify_event *inotify_ignored_event;
61 60
62/* 61/*
63 * When inotify registers a new group it increments this and uses that 62 * When inotify registers a new group it increments this and uses that
@@ -365,6 +364,17 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
365 return error; 364 return error;
366} 365}
367 366
367static void inotify_remove_from_idr(struct fsnotify_group *group,
368 struct inotify_inode_mark_entry *ientry)
369{
370 struct idr *idr;
371
372 spin_lock(&group->inotify_data.idr_lock);
373 idr = &group->inotify_data.idr;
374 idr_remove(idr, ientry->wd);
375 spin_unlock(&group->inotify_data.idr_lock);
376 ientry->wd = -1;
377}
368/* 378/*
369 * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the 379 * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the
370 * internal reference help on the mark because it is in the idr. 380 * internal reference help on the mark because it is in the idr.
@@ -373,13 +383,19 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
373 struct fsnotify_group *group) 383 struct fsnotify_group *group)
374{ 384{
375 struct inotify_inode_mark_entry *ientry; 385 struct inotify_inode_mark_entry *ientry;
386 struct fsnotify_event *ignored_event;
376 struct inotify_event_private_data *event_priv; 387 struct inotify_event_private_data *event_priv;
377 struct fsnotify_event_private_data *fsn_event_priv; 388 struct fsnotify_event_private_data *fsn_event_priv;
378 struct idr *idr; 389
390 ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
391 FSNOTIFY_EVENT_NONE, NULL, 0,
392 GFP_NOFS);
393 if (!ignored_event)
394 return;
379 395
380 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 396 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
381 397
382 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); 398 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
383 if (unlikely(!event_priv)) 399 if (unlikely(!event_priv))
384 goto skip_send_ignore; 400 goto skip_send_ignore;
385 401
@@ -388,7 +404,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
388 fsn_event_priv->group = group; 404 fsn_event_priv->group = group;
389 event_priv->wd = ientry->wd; 405 event_priv->wd = ientry->wd;
390 406
391 fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv); 407 fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
392 408
393 /* did the private data get added? */ 409 /* did the private data get added? */
394 if (list_empty(&fsn_event_priv->event_list)) 410 if (list_empty(&fsn_event_priv->event_list))
@@ -396,14 +412,16 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
396 412
397skip_send_ignore: 413skip_send_ignore:
398 414
415 /* matches the reference taken when the event was created */
416 fsnotify_put_event(ignored_event);
417
399 /* remove this entry from the idr */ 418 /* remove this entry from the idr */
400 spin_lock(&group->inotify_data.idr_lock); 419 inotify_remove_from_idr(group, ientry);
401 idr = &group->inotify_data.idr;
402 idr_remove(idr, ientry->wd);
403 spin_unlock(&group->inotify_data.idr_lock);
404 420
405 /* removed from idr, drop that reference */ 421 /* removed from idr, drop that reference */
406 fsnotify_put_mark(entry); 422 fsnotify_put_mark(entry);
423
424 atomic_dec(&group->inotify_data.user->inotify_watches);
407} 425}
408 426
409/* ding dong the mark is dead */ 427/* ding dong the mark is dead */
@@ -418,6 +436,7 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
418{ 436{
419 struct fsnotify_mark_entry *entry = NULL; 437 struct fsnotify_mark_entry *entry = NULL;
420 struct inotify_inode_mark_entry *ientry; 438 struct inotify_inode_mark_entry *ientry;
439 struct inotify_inode_mark_entry *tmp_ientry;
421 int ret = 0; 440 int ret = 0;
422 int add = (arg & IN_MASK_ADD); 441 int add = (arg & IN_MASK_ADD);
423 __u32 mask; 442 __u32 mask;
@@ -428,54 +447,66 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
428 if (unlikely(!mask)) 447 if (unlikely(!mask))
429 return -EINVAL; 448 return -EINVAL;
430 449
431 ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); 450 tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
432 if (unlikely(!ientry)) 451 if (unlikely(!tmp_ientry))
433 return -ENOMEM; 452 return -ENOMEM;
434 /* we set the mask at the end after attaching it */ 453 /* we set the mask at the end after attaching it */
435 fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark); 454 fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
436 ientry->wd = 0; 455 tmp_ientry->wd = -1;
437 456
438find_entry: 457find_entry:
439 spin_lock(&inode->i_lock); 458 spin_lock(&inode->i_lock);
440 entry = fsnotify_find_mark_entry(group, inode); 459 entry = fsnotify_find_mark_entry(group, inode);
441 spin_unlock(&inode->i_lock); 460 spin_unlock(&inode->i_lock);
442 if (entry) { 461 if (entry) {
443 kmem_cache_free(inotify_inode_mark_cachep, ientry);
444 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); 462 ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
445 } else { 463 } else {
446 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) { 464 ret = -ENOSPC;
447 ret = -ENOSPC; 465 if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
448 goto out_err;
449 }
450
451 ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode);
452 if (ret == -EEXIST)
453 goto find_entry;
454 else if (ret)
455 goto out_err; 466 goto out_err;
456
457 entry = &ientry->fsn_entry;
458retry: 467retry:
459 ret = -ENOMEM; 468 ret = -ENOMEM;
460 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) 469 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
461 goto out_err; 470 goto out_err;
462 471
463 spin_lock(&group->inotify_data.idr_lock); 472 spin_lock(&group->inotify_data.idr_lock);
464 /* if entry is added to the idr we keep the reference obtained 473 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
465 * through fsnotify_mark_add. remember to drop this reference 474 group->inotify_data.last_wd,
466 * when entry is removed from idr */ 475 &tmp_ientry->wd);
467 ret = idr_get_new_above(&group->inotify_data.idr, entry,
468 ++group->inotify_data.last_wd,
469 &ientry->wd);
470 spin_unlock(&group->inotify_data.idr_lock); 476 spin_unlock(&group->inotify_data.idr_lock);
471 if (ret) { 477 if (ret) {
472 if (ret == -EAGAIN) 478 if (ret == -EAGAIN)
473 goto retry; 479 goto retry;
474 goto out_err; 480 goto out_err;
475 } 481 }
482
483 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
484 if (ret) {
485 inotify_remove_from_idr(group, tmp_ientry);
486 if (ret == -EEXIST)
487 goto find_entry;
488 goto out_err;
489 }
490
491 /* tmp_ientry has been added to the inode, so we are all set up.
492 * now we just need to make sure tmp_ientry doesn't get freed and
493 * we need to set up entry and ientry so the generic code can
494 * do its thing. */
495 ientry = tmp_ientry;
496 entry = &ientry->fsn_entry;
497 tmp_ientry = NULL;
498
476 atomic_inc(&group->inotify_data.user->inotify_watches); 499 atomic_inc(&group->inotify_data.user->inotify_watches);
500
501 /* update the idr hint */
502 group->inotify_data.last_wd = ientry->wd;
503
504 /* we put the mark on the idr, take a reference */
505 fsnotify_get_mark(entry);
477 } 506 }
478 507
508 ret = ientry->wd;
509
479 spin_lock(&entry->lock); 510 spin_lock(&entry->lock);
480 511
481 old_mask = entry->mask; 512 old_mask = entry->mask;
@@ -506,14 +537,19 @@ retry:
506 fsnotify_recalc_group_mask(group); 537 fsnotify_recalc_group_mask(group);
507 } 538 }
508 539
509 return ientry->wd; 540 /* this either matches fsnotify_find_mark_entry, or init_mark_entry
541 * depending on which path we took... */
542 fsnotify_put_mark(entry);
510 543
511out_err: 544out_err:
512 /* see this isn't supposed to happen, just kill the watch */ 545 /* could be an error, could be that we found an existing mark */
513 if (entry) { 546 if (tmp_ientry) {
514 fsnotify_destroy_mark_by_entry(entry); 547 /* on the idr but didn't make it on the inode */
515 fsnotify_put_mark(entry); 548 if (tmp_ientry->wd != -1)
549 inotify_remove_from_idr(group, tmp_ientry);
550 kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
516 } 551 }
552
517 return ret; 553 return ret;
518} 554}
519 555
@@ -721,9 +757,6 @@ static int __init inotify_user_setup(void)
721 757
722 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); 758 inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
723 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); 759 event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
724 inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
725 if (!inotify_ignored_event)
726 panic("unable to allocate the inotify ignored event\n");
727 760
728 inotify_max_queued_events = 16384; 761 inotify_max_queued_events = 16384;
729 inotify_max_user_instances = 128; 762 inotify_max_user_instances = 128;
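[Editorial note] The reworked inotify_update_watch() publishes the watch descriptor in the idr first, attaches the mark to the inode second, and unwinds the idr entry if the attach fails (wd == -1 marks "not in the idr"). A toy two-step registration with the same rollback shape; all helpers and the fixed-size slot table are invented for illustration.

    #include <stdio.h>
    #include <errno.h>

    #define MAX_WD 8

    static void *idr_slots[MAX_WD];          /* toy stand-in for the kernel idr */

    static int toy_idr_get_new(void *ptr)     /* returns a wd or -ENOSPC */
    {
        for (int wd = 0; wd < MAX_WD; wd++)
            if (!idr_slots[wd]) { idr_slots[wd] = ptr; return wd; }
        return -ENOSPC;
    }

    static void toy_idr_remove(int wd) { idr_slots[wd] = NULL; }

    /* Pretend attach to the inode; always fails to show the unwind path. */
    static int toy_attach_to_inode(void *mark) { (void)mark; return -EEXIST; }

    static int add_watch(void *mark)
    {
        int wd = toy_idr_get_new(mark);       /* step 1: publish in the idr */
        if (wd < 0)
            return wd;

        int ret = toy_attach_to_inode(mark);  /* step 2: attach to the inode */
        if (ret) {
            toy_idr_remove(wd);               /* unwind step 1 on failure */
            return ret;
        }
        return wd;                            /* success: the wd is the handle */
    }

    int main(void)
    {
        int mark;                             /* stand-in object */
        printf("add_watch -> %d\n", add_watch(&mark));
        printf("slot 0 after unwind: %p\n", idr_slots[0]);   /* back to NULL */
        return 0;
    }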
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 959b73e756fd..521368574e97 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -136,18 +136,24 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
136{ 136{
137 if ((old->mask == new->mask) && 137 if ((old->mask == new->mask) &&
138 (old->to_tell == new->to_tell) && 138 (old->to_tell == new->to_tell) &&
139 (old->data_type == new->data_type)) { 139 (old->data_type == new->data_type) &&
140 (old->name_len == new->name_len)) {
140 switch (old->data_type) { 141 switch (old->data_type) {
141 case (FSNOTIFY_EVENT_INODE): 142 case (FSNOTIFY_EVENT_INODE):
142 if (old->inode == new->inode) 143 /* remember, after old was put on the wait_q we aren't
144 * allowed to look at the inode any more, only thing
145 * left to check was if the file_name is the same */
146 if (old->name_len &&
147 !strcmp(old->file_name, new->file_name))
143 return true; 148 return true;
144 break; 149 break;
145 case (FSNOTIFY_EVENT_PATH): 150 case (FSNOTIFY_EVENT_PATH):
146 if ((old->path.mnt == new->path.mnt) && 151 if ((old->path.mnt == new->path.mnt) &&
147 (old->path.dentry == new->path.dentry)) 152 (old->path.dentry == new->path.dentry))
148 return true; 153 return true;
154 break;
149 case (FSNOTIFY_EVENT_NONE): 155 case (FSNOTIFY_EVENT_NONE):
150 return true; 156 return false;
151 }; 157 };
152 } 158 }
153 return false; 159 return false;
@@ -339,18 +345,19 @@ static void initialize_event(struct fsnotify_event *event)
339 * @name the filename, if available 345 * @name the filename, if available
340 */ 346 */
341struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, 347struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
342 int data_type, const char *name, u32 cookie) 348 int data_type, const char *name, u32 cookie,
349 gfp_t gfp)
343{ 350{
344 struct fsnotify_event *event; 351 struct fsnotify_event *event;
345 352
346 event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); 353 event = kmem_cache_alloc(fsnotify_event_cachep, gfp);
347 if (!event) 354 if (!event)
348 return NULL; 355 return NULL;
349 356
350 initialize_event(event); 357 initialize_event(event);
351 358
352 if (name) { 359 if (name) {
353 event->file_name = kstrdup(name, GFP_KERNEL); 360 event->file_name = kstrdup(name, gfp);
354 if (!event->file_name) { 361 if (!event->file_name) {
355 kmem_cache_free(fsnotify_event_cachep, event); 362 kmem_cache_free(fsnotify_event_cachep, event);
356 return NULL; 363 return NULL;
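[Editorial note] event_compare() now also requires equal name_len and, for inode events, an identical file_name, because once the old event sits on the queue its inode may no longer be safe to dereference. A small sketch of merging queued events only on fields that stay valid; the types are toys, not the fsnotify structs.

    #include <stdio.h>
    #include <string.h>

    struct toy_event {
        unsigned mask;
        size_t name_len;
        const char *file_name;   /* may be "" when no name was recorded */
    };

    /* Decide whether a new event can be merged with one already queued.
     * Only fields that remain valid later are compared, which is why the
     * recorded name is used rather than the object it referred to. */
    static int can_merge(const struct toy_event *old, const struct toy_event *new)
    {
        if (old->mask != new->mask || old->name_len != new->name_len)
            return 0;
        return old->name_len && strcmp(old->file_name, new->file_name) == 0;
    }

    int main(void)
    {
        struct toy_event a = { 0x100, 5, "a.txt" };
        struct toy_event b = { 0x100, 5, "a.txt" };
        struct toy_event c = { 0x100, 5, "b.txt" };
        printf("a/b merge: %d\n", can_merge(&a, &b));   /* 1 */
        printf("a/c merge: %d\n", can_merge(&a, &c));   /* 0 */
        return 0;
    }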
diff --git a/fs/pipe.c b/fs/pipe.c
index f7dd21ad85a6..52c415114838 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -68,8 +68,8 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
68 pipe_lock_nested(pipe1, I_MUTEX_PARENT); 68 pipe_lock_nested(pipe1, I_MUTEX_PARENT);
69 pipe_lock_nested(pipe2, I_MUTEX_CHILD); 69 pipe_lock_nested(pipe2, I_MUTEX_CHILD);
70 } else { 70 } else {
71 pipe_lock_nested(pipe2, I_MUTEX_CHILD); 71 pipe_lock_nested(pipe2, I_MUTEX_PARENT);
72 pipe_lock_nested(pipe1, I_MUTEX_PARENT); 72 pipe_lock_nested(pipe1, I_MUTEX_CHILD);
73 } 73 }
74} 74}
75 75
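[Editorial note] The pipe_double_lock change keeps the nesting annotation tied to acquisition order: whichever lock is taken first is annotated as the parent. The sketch below shows the related userspace discipline of always taking two locks in one global order; it uses plain pthread mutexes and is an analogy, not the kernel's lockdep annotation mechanism.

    #include <pthread.h>
    #include <stdio.h>

    /* Lock two pipe-like objects in a single global order (here: by address),
     * so every caller nests them the same way and cannot deadlock. */
    struct toy_pipe { pthread_mutex_t lock; };

    static void toy_double_lock(struct toy_pipe *a, struct toy_pipe *b)
    {
        struct toy_pipe *first  = a < b ? a : b;
        struct toy_pipe *second = a < b ? b : a;
        pthread_mutex_lock(&first->lock);    /* always the "parent" */
        pthread_mutex_lock(&second->lock);   /* always the "child"  */
    }

    static void toy_double_unlock(struct toy_pipe *a, struct toy_pipe *b)
    {
        pthread_mutex_unlock(&a->lock);
        pthread_mutex_unlock(&b->lock);
    }

    int main(void)
    {
        struct toy_pipe p1 = { PTHREAD_MUTEX_INITIALIZER };
        struct toy_pipe p2 = { PTHREAD_MUTEX_INITIALIZER };
        toy_double_lock(&p2, &p1);           /* argument order does not matter */
        puts("both locked in a consistent order");
        toy_double_unlock(&p1, &p2);
        return 0;
    }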