author    Linus Torvalds <torvalds@linux-foundation.org>  2011-07-27 19:43:52 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2011-07-27 19:43:52 -0400
commit    22712200e175e0df5c7f9edfe6c6bf5c94c23b83 (patch)
tree      a3e332aab7f5a953ff4f12e67af2a0e5f32f5be5 /fs/btrfs
parent    597a67e0ba758e3d2239c81fbb648c6e69ec30a2 (diff)
parent    ff95acb6733d41a8d45feb0e18b96df25e610e78 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
  Btrfs: make sure reserve_metadata_bytes doesn't leak out strange errors
  Btrfs: use the commit_root for reading free_space_inode crcs
  Btrfs: reduce extent_state lock contention for metadata
  Btrfs: remove lockdep magic from btrfs_next_leaf
  Btrfs: make a lockdep class for each root
  Btrfs: switch the btrfs tree locks to reader/writer
  Btrfs: fix deadlock when throttling transactions
  Btrfs: stop using highmem for extent_buffers
  Btrfs: fix BUG_ON() caused by ENOSPC when relocating space
  Btrfs: tag pages for writeback in sync
  Btrfs: fix enospc problems with delalloc
  Btrfs: don't flush delalloc arbitrarily
  Btrfs: use find_or_create_page instead of grab_cache_page
  Btrfs: use a worker thread to do caching
  Btrfs: fix how we merge extent states and deal with cached states
  Btrfs: use the normal checksumming infrastructure for free space cache
  Btrfs: serialize flushers in reserve_metadata_bytes
  Btrfs: do transaction space reservation before joining the transaction
  Btrfs: try to only do one btrfs_search_slot in do_setxattr
Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/btrfs_inode.h        16
-rw-r--r--  fs/btrfs/ctree.c             457
-rw-r--r--  fs/btrfs/ctree.h              14
-rw-r--r--  fs/btrfs/delayed-inode.c       2
-rw-r--r--  fs/btrfs/dir-item.c            9
-rw-r--r--  fs/btrfs/disk-io.c           116
-rw-r--r--  fs/btrfs/disk-io.h            10
-rw-r--r--  fs/btrfs/extent-tree.c       285
-rw-r--r--  fs/btrfs/extent_io.c         168
-rw-r--r--  fs/btrfs/extent_io.h          35
-rw-r--r--  fs/btrfs/file-item.c          41
-rw-r--r--  fs/btrfs/file.c               11
-rw-r--r--  fs/btrfs/free-space-cache.c  173
-rw-r--r--  fs/btrfs/inode.c              90
-rw-r--r--  fs/btrfs/ioctl.c               8
-rw-r--r--  fs/btrfs/locking.c           280
-rw-r--r--  fs/btrfs/locking.h            36
-rw-r--r--  fs/btrfs/relocation.c          3
-rw-r--r--  fs/btrfs/struct-funcs.c      100
-rw-r--r--  fs/btrfs/transaction.c        47
-rw-r--r--  fs/btrfs/tree-log.c            6
-rw-r--r--  fs/btrfs/volumes.c             2
-rw-r--r--  fs/btrfs/xattr.c              66
23 files changed, 965 insertions, 1010 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 52d7eca8c7b..502b9e98867 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -34,6 +34,9 @@ struct btrfs_inode {
34 */ 34 */
35 struct btrfs_key location; 35 struct btrfs_key location;
36 36
37 /* Lock for counters */
38 spinlock_t lock;
39
37 /* the extent_tree has caches of all the extent mappings to disk */ 40 /* the extent_tree has caches of all the extent mappings to disk */
38 struct extent_map_tree extent_tree; 41 struct extent_map_tree extent_tree;
39 42
@@ -134,8 +137,8 @@ struct btrfs_inode {
134 * items we think we'll end up using, and reserved_extents is the number 137 * items we think we'll end up using, and reserved_extents is the number
135 * of extent items we've reserved metadata for. 138 * of extent items we've reserved metadata for.
136 */ 139 */
137 atomic_t outstanding_extents; 140 unsigned outstanding_extents;
138 atomic_t reserved_extents; 141 unsigned reserved_extents;
139 142
140 /* 143 /*
141 * ordered_data_close is set by truncate when a file that used 144 * ordered_data_close is set by truncate when a file that used
@@ -184,4 +187,13 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
184 BTRFS_I(inode)->disk_i_size = size; 187 BTRFS_I(inode)->disk_i_size = size;
185} 188}
186 189
190static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
191 struct inode *inode)
192{
193 if (root == root->fs_info->tree_root ||
194 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
195 return true;
196 return false;
197}
198
187#endif 199#endif
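
[Context for the btrfs_inode.h change above: once outstanding_extents and reserved_extents have to be updated consistently with each other, a single per-inode spinlock around plain unsigned integers is simpler than two independent atomics. A minimal userspace sketch of that idea follows; it is not part of the patch and all names besides BTRFS_I(inode)->lock are illustrative.]

    #include <pthread.h>
    #include <stdio.h>

    struct inode_counters {
            pthread_spinlock_t lock;        /* plays the role of BTRFS_I(inode)->lock */
            unsigned outstanding_extents;
            unsigned reserved_extents;
    };

    static void reserve_one_extent(struct inode_counters *c)
    {
            pthread_spin_lock(&c->lock);
            c->outstanding_extents++;
            c->reserved_extents++;          /* both updates become visible together */
            pthread_spin_unlock(&c->lock);
    }

    int main(void)
    {
            struct inode_counters c = { .outstanding_extents = 0, .reserved_extents = 0 };

            pthread_spin_init(&c.lock, PTHREAD_PROCESS_PRIVATE);
            reserve_one_extent(&c);
            printf("%u %u\n", c.outstanding_extents, c.reserved_extents);
            return 0;
    }
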
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2e667868e0d..011cab3aca8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,8 +54,13 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
54{ 54{
55 int i; 55 int i;
56 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 56 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
57 if (p->nodes[i] && p->locks[i]) 57 if (!p->nodes[i] || !p->locks[i])
58 btrfs_set_lock_blocking(p->nodes[i]); 58 continue;
59 btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
60 if (p->locks[i] == BTRFS_READ_LOCK)
61 p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
62 else if (p->locks[i] == BTRFS_WRITE_LOCK)
63 p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
59 } 64 }
60} 65}
61 66
@@ -68,7 +73,7 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
68 * for held 73 * for held
69 */ 74 */
70noinline void btrfs_clear_path_blocking(struct btrfs_path *p, 75noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
71 struct extent_buffer *held) 76 struct extent_buffer *held, int held_rw)
72{ 77{
73 int i; 78 int i;
74 79
@@ -79,19 +84,29 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
79 * really sure by forcing the path to blocking before we clear 84 * really sure by forcing the path to blocking before we clear
80 * the path blocking. 85 * the path blocking.
81 */ 86 */
82 if (held) 87 if (held) {
83 btrfs_set_lock_blocking(held); 88 btrfs_set_lock_blocking_rw(held, held_rw);
89 if (held_rw == BTRFS_WRITE_LOCK)
90 held_rw = BTRFS_WRITE_LOCK_BLOCKING;
91 else if (held_rw == BTRFS_READ_LOCK)
92 held_rw = BTRFS_READ_LOCK_BLOCKING;
93 }
84 btrfs_set_path_blocking(p); 94 btrfs_set_path_blocking(p);
85#endif 95#endif
86 96
87 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { 97 for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
88 if (p->nodes[i] && p->locks[i]) 98 if (p->nodes[i] && p->locks[i]) {
89 btrfs_clear_lock_blocking(p->nodes[i]); 99 btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
100 if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
101 p->locks[i] = BTRFS_WRITE_LOCK;
102 else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
103 p->locks[i] = BTRFS_READ_LOCK;
104 }
90 } 105 }
91 106
92#ifdef CONFIG_DEBUG_LOCK_ALLOC 107#ifdef CONFIG_DEBUG_LOCK_ALLOC
93 if (held) 108 if (held)
94 btrfs_clear_lock_blocking(held); 109 btrfs_clear_lock_blocking_rw(held, held_rw);
95#endif 110#endif
96} 111}
97 112
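
[The two helpers above now track, per path level, whether the lock held there is a read or a write lock and whether it is currently in its spinning or blocking form. A small illustrative sketch of that bookkeeping as a state mapping; the enum is hypothetical, the kernel side uses the BTRFS_*_LOCK values from locking.h.]

    enum path_lock_state {
            LOCK_NONE = 0,
            WRITE_LOCK,
            READ_LOCK,
            WRITE_LOCK_BLOCKING,
            READ_LOCK_BLOCKING,
    };

    /* what btrfs_set_path_blocking() records for a level it made blocking */
    static enum path_lock_state make_blocking(enum path_lock_state s)
    {
            if (s == READ_LOCK)
                    return READ_LOCK_BLOCKING;
            if (s == WRITE_LOCK)
                    return WRITE_LOCK_BLOCKING;
            return s;
    }

    /* what btrfs_clear_path_blocking() records once the level spins again */
    static enum path_lock_state make_spinning(enum path_lock_state s)
    {
            if (s == READ_LOCK_BLOCKING)
                    return READ_LOCK;
            if (s == WRITE_LOCK_BLOCKING)
                    return WRITE_LOCK;
            return s;
    }
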
@@ -119,7 +134,7 @@ noinline void btrfs_release_path(struct btrfs_path *p)
119 if (!p->nodes[i]) 134 if (!p->nodes[i])
120 continue; 135 continue;
121 if (p->locks[i]) { 136 if (p->locks[i]) {
122 btrfs_tree_unlock(p->nodes[i]); 137 btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
123 p->locks[i] = 0; 138 p->locks[i] = 0;
124 } 139 }
125 free_extent_buffer(p->nodes[i]); 140 free_extent_buffer(p->nodes[i]);
@@ -167,6 +182,25 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
167 return eb; 182 return eb;
168} 183}
169 184
185/* loop around taking references on and locking the root node of the
186 * tree until you end up with a lock on the root. A locked buffer
187 * is returned, with a reference held.
188 */
189struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
190{
191 struct extent_buffer *eb;
192
193 while (1) {
194 eb = btrfs_root_node(root);
195 btrfs_tree_read_lock(eb);
196 if (eb == root->node)
197 break;
198 btrfs_tree_read_unlock(eb);
199 free_extent_buffer(eb);
200 }
201 return eb;
202}
203
170/* cowonly root (everything not a reference counted cow subvolume), just get 204/* cowonly root (everything not a reference counted cow subvolume), just get
171 * put onto a simple dirty list. transaction.c walks this to make sure they 205 * put onto a simple dirty list. transaction.c walks this to make sure they
172 * get properly updated on disk. 206 * get properly updated on disk.
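
[btrfs_read_lock_root_node() above has to cope with the root being replaced while it waits for the lock, hence the retry loop. A userspace analogue of the same "lock it, then check it is still the root" pattern; illustrative only, and the unsynchronized pointer reads are tolerated here for brevity (the kernel side takes a reference via btrfs_root_node()).]

    #include <pthread.h>

    struct node {
            pthread_rwlock_t lock;
    };

    struct tree {
            struct node *root;      /* may be swapped by concurrent writers */
    };

    static struct node *read_lock_root(struct tree *t)
    {
            struct node *n;

            while (1) {
                    n = t->root;                     /* grab whatever is root right now */
                    pthread_rwlock_rdlock(&n->lock);
                    if (n == t->root)                /* still the root?  keep the lock  */
                            return n;
                    pthread_rwlock_unlock(&n->lock); /* it moved while we waited, retry */
            }
    }
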
@@ -626,14 +660,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
626 for (i = start_slot; i < end_slot; i++) { 660 for (i = start_slot; i < end_slot; i++) {
627 int close = 1; 661 int close = 1;
628 662
629 if (!parent->map_token) {
630 map_extent_buffer(parent,
631 btrfs_node_key_ptr_offset(i),
632 sizeof(struct btrfs_key_ptr),
633 &parent->map_token, &parent->kaddr,
634 &parent->map_start, &parent->map_len,
635 KM_USER1);
636 }
637 btrfs_node_key(parent, &disk_key, i); 663 btrfs_node_key(parent, &disk_key, i);
638 if (!progress_passed && comp_keys(&disk_key, progress) < 0) 664 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
639 continue; 665 continue;
@@ -656,11 +682,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
656 last_block = blocknr; 682 last_block = blocknr;
657 continue; 683 continue;
658 } 684 }
659 if (parent->map_token) {
660 unmap_extent_buffer(parent, parent->map_token,
661 KM_USER1);
662 parent->map_token = NULL;
663 }
664 685
665 cur = btrfs_find_tree_block(root, blocknr, blocksize); 686 cur = btrfs_find_tree_block(root, blocknr, blocksize);
666 if (cur) 687 if (cur)
@@ -701,11 +722,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
701 btrfs_tree_unlock(cur); 722 btrfs_tree_unlock(cur);
702 free_extent_buffer(cur); 723 free_extent_buffer(cur);
703 } 724 }
704 if (parent->map_token) {
705 unmap_extent_buffer(parent, parent->map_token,
706 KM_USER1);
707 parent->map_token = NULL;
708 }
709 return err; 725 return err;
710} 726}
711 727
@@ -746,7 +762,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
746 struct btrfs_disk_key *tmp = NULL; 762 struct btrfs_disk_key *tmp = NULL;
747 struct btrfs_disk_key unaligned; 763 struct btrfs_disk_key unaligned;
748 unsigned long offset; 764 unsigned long offset;
749 char *map_token = NULL;
750 char *kaddr = NULL; 765 char *kaddr = NULL;
751 unsigned long map_start = 0; 766 unsigned long map_start = 0;
752 unsigned long map_len = 0; 767 unsigned long map_len = 0;
@@ -756,18 +771,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
756 mid = (low + high) / 2; 771 mid = (low + high) / 2;
757 offset = p + mid * item_size; 772 offset = p + mid * item_size;
758 773
759 if (!map_token || offset < map_start || 774 if (!kaddr || offset < map_start ||
760 (offset + sizeof(struct btrfs_disk_key)) > 775 (offset + sizeof(struct btrfs_disk_key)) >
761 map_start + map_len) { 776 map_start + map_len) {
762 if (map_token) {
763 unmap_extent_buffer(eb, map_token, KM_USER0);
764 map_token = NULL;
765 }
766 777
767 err = map_private_extent_buffer(eb, offset, 778 err = map_private_extent_buffer(eb, offset,
768 sizeof(struct btrfs_disk_key), 779 sizeof(struct btrfs_disk_key),
769 &map_token, &kaddr, 780 &kaddr, &map_start, &map_len);
770 &map_start, &map_len, KM_USER0);
771 781
772 if (!err) { 782 if (!err) {
773 tmp = (struct btrfs_disk_key *)(kaddr + offset - 783 tmp = (struct btrfs_disk_key *)(kaddr + offset -
@@ -790,14 +800,10 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
790 high = mid; 800 high = mid;
791 else { 801 else {
792 *slot = mid; 802 *slot = mid;
793 if (map_token)
794 unmap_extent_buffer(eb, map_token, KM_USER0);
795 return 0; 803 return 0;
796 } 804 }
797 } 805 }
798 *slot = low; 806 *slot = low;
799 if (map_token)
800 unmap_extent_buffer(eb, map_token, KM_USER0);
801 return 1; 807 return 1;
802} 808}
803 809
@@ -890,7 +896,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
890 896
891 mid = path->nodes[level]; 897 mid = path->nodes[level];
892 898
893 WARN_ON(!path->locks[level]); 899 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
900 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
894 WARN_ON(btrfs_header_generation(mid) != trans->transid); 901 WARN_ON(btrfs_header_generation(mid) != trans->transid);
895 902
896 orig_ptr = btrfs_node_blockptr(mid, orig_slot); 903 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
@@ -1228,7 +1235,6 @@ static void reada_for_search(struct btrfs_root *root,
1228 u32 nr; 1235 u32 nr;
1229 u32 blocksize; 1236 u32 blocksize;
1230 u32 nscan = 0; 1237 u32 nscan = 0;
1231 bool map = true;
1232 1238
1233 if (level != 1) 1239 if (level != 1)
1234 return; 1240 return;
@@ -1250,19 +1256,8 @@ static void reada_for_search(struct btrfs_root *root,
1250 1256
1251 nritems = btrfs_header_nritems(node); 1257 nritems = btrfs_header_nritems(node);
1252 nr = slot; 1258 nr = slot;
1253 if (node->map_token || path->skip_locking)
1254 map = false;
1255 1259
1256 while (1) { 1260 while (1) {
1257 if (map && !node->map_token) {
1258 unsigned long offset = btrfs_node_key_ptr_offset(nr);
1259 map_private_extent_buffer(node, offset,
1260 sizeof(struct btrfs_key_ptr),
1261 &node->map_token,
1262 &node->kaddr,
1263 &node->map_start,
1264 &node->map_len, KM_USER1);
1265 }
1266 if (direction < 0) { 1261 if (direction < 0) {
1267 if (nr == 0) 1262 if (nr == 0)
1268 break; 1263 break;
@@ -1281,11 +1276,6 @@ static void reada_for_search(struct btrfs_root *root,
1281 if ((search <= target && target - search <= 65536) || 1276 if ((search <= target && target - search <= 65536) ||
1282 (search > target && search - target <= 65536)) { 1277 (search > target && search - target <= 65536)) {
1283 gen = btrfs_node_ptr_generation(node, nr); 1278 gen = btrfs_node_ptr_generation(node, nr);
1284 if (map && node->map_token) {
1285 unmap_extent_buffer(node, node->map_token,
1286 KM_USER1);
1287 node->map_token = NULL;
1288 }
1289 readahead_tree_block(root, search, blocksize, gen); 1279 readahead_tree_block(root, search, blocksize, gen);
1290 nread += blocksize; 1280 nread += blocksize;
1291 } 1281 }
@@ -1293,10 +1283,6 @@ static void reada_for_search(struct btrfs_root *root,
1293 if ((nread > 65536 || nscan > 32)) 1283 if ((nread > 65536 || nscan > 32))
1294 break; 1284 break;
1295 } 1285 }
1296 if (map && node->map_token) {
1297 unmap_extent_buffer(node, node->map_token, KM_USER1);
1298 node->map_token = NULL;
1299 }
1300} 1286}
1301 1287
1302/* 1288/*
@@ -1409,7 +1395,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
1409 1395
1410 t = path->nodes[i]; 1396 t = path->nodes[i];
1411 if (i >= lowest_unlock && i > skip_level && path->locks[i]) { 1397 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1412 btrfs_tree_unlock(t); 1398 btrfs_tree_unlock_rw(t, path->locks[i]);
1413 path->locks[i] = 0; 1399 path->locks[i] = 0;
1414 } 1400 }
1415 } 1401 }
@@ -1436,7 +1422,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1436 continue; 1422 continue;
1437 if (!path->locks[i]) 1423 if (!path->locks[i])
1438 continue; 1424 continue;
1439 btrfs_tree_unlock(path->nodes[i]); 1425 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
1440 path->locks[i] = 0; 1426 path->locks[i] = 0;
1441 } 1427 }
1442} 1428}
@@ -1485,6 +1471,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1485 * we can trust our generation number 1471 * we can trust our generation number
1486 */ 1472 */
1487 free_extent_buffer(tmp); 1473 free_extent_buffer(tmp);
1474 btrfs_set_path_blocking(p);
1475
1488 tmp = read_tree_block(root, blocknr, blocksize, gen); 1476 tmp = read_tree_block(root, blocknr, blocksize, gen);
1489 if (tmp && btrfs_buffer_uptodate(tmp, gen)) { 1477 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1490 *eb_ret = tmp; 1478 *eb_ret = tmp;
@@ -1540,20 +1528,27 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1540static int 1528static int
1541setup_nodes_for_search(struct btrfs_trans_handle *trans, 1529setup_nodes_for_search(struct btrfs_trans_handle *trans,
1542 struct btrfs_root *root, struct btrfs_path *p, 1530 struct btrfs_root *root, struct btrfs_path *p,
1543 struct extent_buffer *b, int level, int ins_len) 1531 struct extent_buffer *b, int level, int ins_len,
1532 int *write_lock_level)
1544{ 1533{
1545 int ret; 1534 int ret;
1546 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= 1535 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1547 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1536 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1548 int sret; 1537 int sret;
1549 1538
1539 if (*write_lock_level < level + 1) {
1540 *write_lock_level = level + 1;
1541 btrfs_release_path(p);
1542 goto again;
1543 }
1544
1550 sret = reada_for_balance(root, p, level); 1545 sret = reada_for_balance(root, p, level);
1551 if (sret) 1546 if (sret)
1552 goto again; 1547 goto again;
1553 1548
1554 btrfs_set_path_blocking(p); 1549 btrfs_set_path_blocking(p);
1555 sret = split_node(trans, root, p, level); 1550 sret = split_node(trans, root, p, level);
1556 btrfs_clear_path_blocking(p, NULL); 1551 btrfs_clear_path_blocking(p, NULL, 0);
1557 1552
1558 BUG_ON(sret > 0); 1553 BUG_ON(sret > 0);
1559 if (sret) { 1554 if (sret) {
@@ -1565,13 +1560,19 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
1565 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { 1560 BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
1566 int sret; 1561 int sret;
1567 1562
1563 if (*write_lock_level < level + 1) {
1564 *write_lock_level = level + 1;
1565 btrfs_release_path(p);
1566 goto again;
1567 }
1568
1568 sret = reada_for_balance(root, p, level); 1569 sret = reada_for_balance(root, p, level);
1569 if (sret) 1570 if (sret)
1570 goto again; 1571 goto again;
1571 1572
1572 btrfs_set_path_blocking(p); 1573 btrfs_set_path_blocking(p);
1573 sret = balance_level(trans, root, p, level); 1574 sret = balance_level(trans, root, p, level);
1574 btrfs_clear_path_blocking(p, NULL); 1575 btrfs_clear_path_blocking(p, NULL, 0);
1575 1576
1576 if (sret) { 1577 if (sret) {
1577 ret = sret; 1578 ret = sret;
@@ -1615,27 +1616,78 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1615 int err; 1616 int err;
1616 int level; 1617 int level;
1617 int lowest_unlock = 1; 1618 int lowest_unlock = 1;
1619 int root_lock;
1620 /* everything at write_lock_level or lower must be write locked */
1621 int write_lock_level = 0;
1618 u8 lowest_level = 0; 1622 u8 lowest_level = 0;
1619 1623
1620 lowest_level = p->lowest_level; 1624 lowest_level = p->lowest_level;
1621 WARN_ON(lowest_level && ins_len > 0); 1625 WARN_ON(lowest_level && ins_len > 0);
1622 WARN_ON(p->nodes[0] != NULL); 1626 WARN_ON(p->nodes[0] != NULL);
1623 1627
1624 if (ins_len < 0) 1628 if (ins_len < 0) {
1625 lowest_unlock = 2; 1629 lowest_unlock = 2;
1626 1630
1631 /* when we are removing items, we might have to go up to level
1632 * two as we update tree pointers Make sure we keep write
1633 * for those levels as well
1634 */
1635 write_lock_level = 2;
1636 } else if (ins_len > 0) {
1637 /*
1638 * for inserting items, make sure we have a write lock on
1639 * level 1 so we can update keys
1640 */
1641 write_lock_level = 1;
1642 }
1643
1644 if (!cow)
1645 write_lock_level = -1;
1646
1647 if (cow && (p->keep_locks || p->lowest_level))
1648 write_lock_level = BTRFS_MAX_LEVEL;
1649
1627again: 1650again:
1651 /*
1652 * we try very hard to do read locks on the root
1653 */
1654 root_lock = BTRFS_READ_LOCK;
1655 level = 0;
1628 if (p->search_commit_root) { 1656 if (p->search_commit_root) {
1657 /*
1658 * the commit roots are read only
1659 * so we always do read locks
1660 */
1629 b = root->commit_root; 1661 b = root->commit_root;
1630 extent_buffer_get(b); 1662 extent_buffer_get(b);
1663 level = btrfs_header_level(b);
1631 if (!p->skip_locking) 1664 if (!p->skip_locking)
1632 btrfs_tree_lock(b); 1665 btrfs_tree_read_lock(b);
1633 } else { 1666 } else {
1634 if (p->skip_locking) 1667 if (p->skip_locking) {
1635 b = btrfs_root_node(root); 1668 b = btrfs_root_node(root);
1636 else 1669 level = btrfs_header_level(b);
1637 b = btrfs_lock_root_node(root); 1670 } else {
1671 /* we don't know the level of the root node
1672 * until we actually have it read locked
1673 */
1674 b = btrfs_read_lock_root_node(root);
1675 level = btrfs_header_level(b);
1676 if (level <= write_lock_level) {
1677 /* whoops, must trade for write lock */
1678 btrfs_tree_read_unlock(b);
1679 free_extent_buffer(b);
1680 b = btrfs_lock_root_node(root);
1681 root_lock = BTRFS_WRITE_LOCK;
1682
1683 /* the level might have changed, check again */
1684 level = btrfs_header_level(b);
1685 }
1686 }
1638 } 1687 }
1688 p->nodes[level] = b;
1689 if (!p->skip_locking)
1690 p->locks[level] = root_lock;
1639 1691
1640 while (b) { 1692 while (b) {
1641 level = btrfs_header_level(b); 1693 level = btrfs_header_level(b);
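
[The write_lock_level logic introduced above boils down to: descend with read locks, and as soon as a level turns out to need modification, raise write_lock_level, drop the whole path and search again so that level and everything below it is taken with a write lock on the next pass. A compressed, illustrative sketch of that policy follows; it is not the btrfs code and every name in it is hypothetical.]

    #include <pthread.h>
    #include <stdbool.h>

    #define MAX_LEVEL 8

    struct level {
            pthread_rwlock_t lock;
            bool needs_balance;     /* stand-in for "this node must be modified" */
    };

    static void search(struct level levels[MAX_LEVEL], int height)
    {
            int write_lock_level = 0;       /* levels at or below this get write locks */

    again:
            for (int lvl = height - 1; lvl >= 0; lvl--) {
                    if (lvl <= write_lock_level)
                            pthread_rwlock_wrlock(&levels[lvl].lock);
                    else
                            pthread_rwlock_rdlock(&levels[lvl].lock);

                    if (levels[lvl].needs_balance && write_lock_level < lvl + 1) {
                            /* the parent must be write locked too: release and retry */
                            for (int i = height - 1; i >= lvl; i--)
                                    pthread_rwlock_unlock(&levels[i].lock);
                            write_lock_level = lvl + 1;
                            goto again;
                    }
            }

            for (int i = height - 1; i >= 0; i--)
                    pthread_rwlock_unlock(&levels[i].lock);
    }
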
@@ -1644,10 +1696,6 @@ again:
1644 * setup the path here so we can release it under lock 1696 * setup the path here so we can release it under lock
1645 * contention with the cow code 1697 * contention with the cow code
1646 */ 1698 */
1647 p->nodes[level] = b;
1648 if (!p->skip_locking)
1649 p->locks[level] = 1;
1650
1651 if (cow) { 1699 if (cow) {
1652 /* 1700 /*
1653 * if we don't really need to cow this block 1701 * if we don't really need to cow this block
@@ -1659,6 +1707,16 @@ again:
1659 1707
1660 btrfs_set_path_blocking(p); 1708 btrfs_set_path_blocking(p);
1661 1709
1710 /*
1711 * must have write locks on this node and the
1712 * parent
1713 */
1714 if (level + 1 > write_lock_level) {
1715 write_lock_level = level + 1;
1716 btrfs_release_path(p);
1717 goto again;
1718 }
1719
1662 err = btrfs_cow_block(trans, root, b, 1720 err = btrfs_cow_block(trans, root, b,
1663 p->nodes[level + 1], 1721 p->nodes[level + 1],
1664 p->slots[level + 1], &b); 1722 p->slots[level + 1], &b);
@@ -1671,10 +1729,7 @@ cow_done:
1671 BUG_ON(!cow && ins_len); 1729 BUG_ON(!cow && ins_len);
1672 1730
1673 p->nodes[level] = b; 1731 p->nodes[level] = b;
1674 if (!p->skip_locking) 1732 btrfs_clear_path_blocking(p, NULL, 0);
1675 p->locks[level] = 1;
1676
1677 btrfs_clear_path_blocking(p, NULL);
1678 1733
1679 /* 1734 /*
1680 * we have a lock on b and as long as we aren't changing 1735 * we have a lock on b and as long as we aren't changing
@@ -1700,7 +1755,7 @@ cow_done:
1700 } 1755 }
1701 p->slots[level] = slot; 1756 p->slots[level] = slot;
1702 err = setup_nodes_for_search(trans, root, p, b, level, 1757 err = setup_nodes_for_search(trans, root, p, b, level,
1703 ins_len); 1758 ins_len, &write_lock_level);
1704 if (err == -EAGAIN) 1759 if (err == -EAGAIN)
1705 goto again; 1760 goto again;
1706 if (err) { 1761 if (err) {
@@ -1710,6 +1765,19 @@ cow_done:
1710 b = p->nodes[level]; 1765 b = p->nodes[level];
1711 slot = p->slots[level]; 1766 slot = p->slots[level];
1712 1767
1768 /*
1769 * slot 0 is special, if we change the key
1770 * we have to update the parent pointer
1771 * which means we must have a write lock
1772 * on the parent
1773 */
1774 if (slot == 0 && cow &&
1775 write_lock_level < level + 1) {
1776 write_lock_level = level + 1;
1777 btrfs_release_path(p);
1778 goto again;
1779 }
1780
1713 unlock_up(p, level, lowest_unlock); 1781 unlock_up(p, level, lowest_unlock);
1714 1782
1715 if (level == lowest_level) { 1783 if (level == lowest_level) {
@@ -1728,23 +1796,42 @@ cow_done:
1728 } 1796 }
1729 1797
1730 if (!p->skip_locking) { 1798 if (!p->skip_locking) {
1731 btrfs_clear_path_blocking(p, NULL); 1799 level = btrfs_header_level(b);
1732 err = btrfs_try_spin_lock(b); 1800 if (level <= write_lock_level) {
1733 1801 err = btrfs_try_tree_write_lock(b);
1734 if (!err) { 1802 if (!err) {
1735 btrfs_set_path_blocking(p); 1803 btrfs_set_path_blocking(p);
1736 btrfs_tree_lock(b); 1804 btrfs_tree_lock(b);
1737 btrfs_clear_path_blocking(p, b); 1805 btrfs_clear_path_blocking(p, b,
1806 BTRFS_WRITE_LOCK);
1807 }
1808 p->locks[level] = BTRFS_WRITE_LOCK;
1809 } else {
1810 err = btrfs_try_tree_read_lock(b);
1811 if (!err) {
1812 btrfs_set_path_blocking(p);
1813 btrfs_tree_read_lock(b);
1814 btrfs_clear_path_blocking(p, b,
1815 BTRFS_READ_LOCK);
1816 }
1817 p->locks[level] = BTRFS_READ_LOCK;
1738 } 1818 }
1819 p->nodes[level] = b;
1739 } 1820 }
1740 } else { 1821 } else {
1741 p->slots[level] = slot; 1822 p->slots[level] = slot;
1742 if (ins_len > 0 && 1823 if (ins_len > 0 &&
1743 btrfs_leaf_free_space(root, b) < ins_len) { 1824 btrfs_leaf_free_space(root, b) < ins_len) {
1825 if (write_lock_level < 1) {
1826 write_lock_level = 1;
1827 btrfs_release_path(p);
1828 goto again;
1829 }
1830
1744 btrfs_set_path_blocking(p); 1831 btrfs_set_path_blocking(p);
1745 err = split_leaf(trans, root, key, 1832 err = split_leaf(trans, root, key,
1746 p, ins_len, ret == 0); 1833 p, ins_len, ret == 0);
1747 btrfs_clear_path_blocking(p, NULL); 1834 btrfs_clear_path_blocking(p, NULL, 0);
1748 1835
1749 BUG_ON(err > 0); 1836 BUG_ON(err > 0);
1750 if (err) { 1837 if (err) {
@@ -2025,7 +2112,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2025 add_root_to_dirty_list(root); 2112 add_root_to_dirty_list(root);
2026 extent_buffer_get(c); 2113 extent_buffer_get(c);
2027 path->nodes[level] = c; 2114 path->nodes[level] = c;
2028 path->locks[level] = 1; 2115 path->locks[level] = BTRFS_WRITE_LOCK;
2029 path->slots[level] = 0; 2116 path->slots[level] = 0;
2030 return 0; 2117 return 0;
2031} 2118}
@@ -2253,14 +2340,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2253 if (path->slots[0] == i) 2340 if (path->slots[0] == i)
2254 push_space += data_size; 2341 push_space += data_size;
2255 2342
2256 if (!left->map_token) {
2257 map_extent_buffer(left, (unsigned long)item,
2258 sizeof(struct btrfs_item),
2259 &left->map_token, &left->kaddr,
2260 &left->map_start, &left->map_len,
2261 KM_USER1);
2262 }
2263
2264 this_item_size = btrfs_item_size(left, item); 2343 this_item_size = btrfs_item_size(left, item);
2265 if (this_item_size + sizeof(*item) + push_space > free_space) 2344 if (this_item_size + sizeof(*item) + push_space > free_space)
2266 break; 2345 break;
@@ -2271,10 +2350,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2271 break; 2350 break;
2272 i--; 2351 i--;
2273 } 2352 }
2274 if (left->map_token) {
2275 unmap_extent_buffer(left, left->map_token, KM_USER1);
2276 left->map_token = NULL;
2277 }
2278 2353
2279 if (push_items == 0) 2354 if (push_items == 0)
2280 goto out_unlock; 2355 goto out_unlock;
@@ -2316,21 +2391,10 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2316 push_space = BTRFS_LEAF_DATA_SIZE(root); 2391 push_space = BTRFS_LEAF_DATA_SIZE(root);
2317 for (i = 0; i < right_nritems; i++) { 2392 for (i = 0; i < right_nritems; i++) {
2318 item = btrfs_item_nr(right, i); 2393 item = btrfs_item_nr(right, i);
2319 if (!right->map_token) {
2320 map_extent_buffer(right, (unsigned long)item,
2321 sizeof(struct btrfs_item),
2322 &right->map_token, &right->kaddr,
2323 &right->map_start, &right->map_len,
2324 KM_USER1);
2325 }
2326 push_space -= btrfs_item_size(right, item); 2394 push_space -= btrfs_item_size(right, item);
2327 btrfs_set_item_offset(right, item, push_space); 2395 btrfs_set_item_offset(right, item, push_space);
2328 } 2396 }
2329 2397
2330 if (right->map_token) {
2331 unmap_extent_buffer(right, right->map_token, KM_USER1);
2332 right->map_token = NULL;
2333 }
2334 left_nritems -= push_items; 2398 left_nritems -= push_items;
2335 btrfs_set_header_nritems(left, left_nritems); 2399 btrfs_set_header_nritems(left, left_nritems);
2336 2400
@@ -2467,13 +2531,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2467 2531
2468 for (i = 0; i < nr; i++) { 2532 for (i = 0; i < nr; i++) {
2469 item = btrfs_item_nr(right, i); 2533 item = btrfs_item_nr(right, i);
2470 if (!right->map_token) {
2471 map_extent_buffer(right, (unsigned long)item,
2472 sizeof(struct btrfs_item),
2473 &right->map_token, &right->kaddr,
2474 &right->map_start, &right->map_len,
2475 KM_USER1);
2476 }
2477 2534
2478 if (!empty && push_items > 0) { 2535 if (!empty && push_items > 0) {
2479 if (path->slots[0] < i) 2536 if (path->slots[0] < i)
@@ -2496,11 +2553,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2496 push_space += this_item_size + sizeof(*item); 2553 push_space += this_item_size + sizeof(*item);
2497 } 2554 }
2498 2555
2499 if (right->map_token) {
2500 unmap_extent_buffer(right, right->map_token, KM_USER1);
2501 right->map_token = NULL;
2502 }
2503
2504 if (push_items == 0) { 2556 if (push_items == 0) {
2505 ret = 1; 2557 ret = 1;
2506 goto out; 2558 goto out;
@@ -2530,23 +2582,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2530 u32 ioff; 2582 u32 ioff;
2531 2583
2532 item = btrfs_item_nr(left, i); 2584 item = btrfs_item_nr(left, i);
2533 if (!left->map_token) {
2534 map_extent_buffer(left, (unsigned long)item,
2535 sizeof(struct btrfs_item),
2536 &left->map_token, &left->kaddr,
2537 &left->map_start, &left->map_len,
2538 KM_USER1);
2539 }
2540 2585
2541 ioff = btrfs_item_offset(left, item); 2586 ioff = btrfs_item_offset(left, item);
2542 btrfs_set_item_offset(left, item, 2587 btrfs_set_item_offset(left, item,
2543 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); 2588 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2544 } 2589 }
2545 btrfs_set_header_nritems(left, old_left_nritems + push_items); 2590 btrfs_set_header_nritems(left, old_left_nritems + push_items);
2546 if (left->map_token) {
2547 unmap_extent_buffer(left, left->map_token, KM_USER1);
2548 left->map_token = NULL;
2549 }
2550 2591
2551 /* fixup right node */ 2592 /* fixup right node */
2552 if (push_items > right_nritems) { 2593 if (push_items > right_nritems) {
@@ -2574,21 +2615,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2574 for (i = 0; i < right_nritems; i++) { 2615 for (i = 0; i < right_nritems; i++) {
2575 item = btrfs_item_nr(right, i); 2616 item = btrfs_item_nr(right, i);
2576 2617
2577 if (!right->map_token) {
2578 map_extent_buffer(right, (unsigned long)item,
2579 sizeof(struct btrfs_item),
2580 &right->map_token, &right->kaddr,
2581 &right->map_start, &right->map_len,
2582 KM_USER1);
2583 }
2584
2585 push_space = push_space - btrfs_item_size(right, item); 2618 push_space = push_space - btrfs_item_size(right, item);
2586 btrfs_set_item_offset(right, item, push_space); 2619 btrfs_set_item_offset(right, item, push_space);
2587 } 2620 }
2588 if (right->map_token) {
2589 unmap_extent_buffer(right, right->map_token, KM_USER1);
2590 right->map_token = NULL;
2591 }
2592 2621
2593 btrfs_mark_buffer_dirty(left); 2622 btrfs_mark_buffer_dirty(left);
2594 if (right_nritems) 2623 if (right_nritems)
@@ -2729,23 +2758,10 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2729 struct btrfs_item *item = btrfs_item_nr(right, i); 2758 struct btrfs_item *item = btrfs_item_nr(right, i);
2730 u32 ioff; 2759 u32 ioff;
2731 2760
2732 if (!right->map_token) {
2733 map_extent_buffer(right, (unsigned long)item,
2734 sizeof(struct btrfs_item),
2735 &right->map_token, &right->kaddr,
2736 &right->map_start, &right->map_len,
2737 KM_USER1);
2738 }
2739
2740 ioff = btrfs_item_offset(right, item); 2761 ioff = btrfs_item_offset(right, item);
2741 btrfs_set_item_offset(right, item, ioff + rt_data_off); 2762 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2742 } 2763 }
2743 2764
2744 if (right->map_token) {
2745 unmap_extent_buffer(right, right->map_token, KM_USER1);
2746 right->map_token = NULL;
2747 }
2748
2749 btrfs_set_header_nritems(l, mid); 2765 btrfs_set_header_nritems(l, mid);
2750 ret = 0; 2766 ret = 0;
2751 btrfs_item_key(right, &disk_key, 0); 2767 btrfs_item_key(right, &disk_key, 0);
@@ -3264,23 +3280,10 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
3264 u32 ioff; 3280 u32 ioff;
3265 item = btrfs_item_nr(leaf, i); 3281 item = btrfs_item_nr(leaf, i);
3266 3282
3267 if (!leaf->map_token) {
3268 map_extent_buffer(leaf, (unsigned long)item,
3269 sizeof(struct btrfs_item),
3270 &leaf->map_token, &leaf->kaddr,
3271 &leaf->map_start, &leaf->map_len,
3272 KM_USER1);
3273 }
3274
3275 ioff = btrfs_item_offset(leaf, item); 3283 ioff = btrfs_item_offset(leaf, item);
3276 btrfs_set_item_offset(leaf, item, ioff + size_diff); 3284 btrfs_set_item_offset(leaf, item, ioff + size_diff);
3277 } 3285 }
3278 3286
3279 if (leaf->map_token) {
3280 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3281 leaf->map_token = NULL;
3282 }
3283
3284 /* shift the data */ 3287 /* shift the data */
3285 if (from_end) { 3288 if (from_end) {
3286 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 3289 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
@@ -3377,22 +3380,10 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
3377 u32 ioff; 3380 u32 ioff;
3378 item = btrfs_item_nr(leaf, i); 3381 item = btrfs_item_nr(leaf, i);
3379 3382
3380 if (!leaf->map_token) {
3381 map_extent_buffer(leaf, (unsigned long)item,
3382 sizeof(struct btrfs_item),
3383 &leaf->map_token, &leaf->kaddr,
3384 &leaf->map_start, &leaf->map_len,
3385 KM_USER1);
3386 }
3387 ioff = btrfs_item_offset(leaf, item); 3383 ioff = btrfs_item_offset(leaf, item);
3388 btrfs_set_item_offset(leaf, item, ioff - data_size); 3384 btrfs_set_item_offset(leaf, item, ioff - data_size);
3389 } 3385 }
3390 3386
3391 if (leaf->map_token) {
3392 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3393 leaf->map_token = NULL;
3394 }
3395
3396 /* shift the data */ 3387 /* shift the data */
3397 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + 3388 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3398 data_end - data_size, btrfs_leaf_data(leaf) + 3389 data_end - data_size, btrfs_leaf_data(leaf) +
@@ -3494,27 +3485,13 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3494 * item0..itemN ... dataN.offset..dataN.size .. data0.size 3485 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3495 */ 3486 */
3496 /* first correct the data pointers */ 3487 /* first correct the data pointers */
3497 WARN_ON(leaf->map_token);
3498 for (i = slot; i < nritems; i++) { 3488 for (i = slot; i < nritems; i++) {
3499 u32 ioff; 3489 u32 ioff;
3500 3490
3501 item = btrfs_item_nr(leaf, i); 3491 item = btrfs_item_nr(leaf, i);
3502 if (!leaf->map_token) {
3503 map_extent_buffer(leaf, (unsigned long)item,
3504 sizeof(struct btrfs_item),
3505 &leaf->map_token, &leaf->kaddr,
3506 &leaf->map_start, &leaf->map_len,
3507 KM_USER1);
3508 }
3509
3510 ioff = btrfs_item_offset(leaf, item); 3492 ioff = btrfs_item_offset(leaf, item);
3511 btrfs_set_item_offset(leaf, item, ioff - total_data); 3493 btrfs_set_item_offset(leaf, item, ioff - total_data);
3512 } 3494 }
3513 if (leaf->map_token) {
3514 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3515 leaf->map_token = NULL;
3516 }
3517
3518 /* shift the items */ 3495 /* shift the items */
3519 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), 3496 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3520 btrfs_item_nr_offset(slot), 3497 btrfs_item_nr_offset(slot),
@@ -3608,27 +3585,13 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans,
3608 * item0..itemN ... dataN.offset..dataN.size .. data0.size 3585 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3609 */ 3586 */
3610 /* first correct the data pointers */ 3587 /* first correct the data pointers */
3611 WARN_ON(leaf->map_token);
3612 for (i = slot; i < nritems; i++) { 3588 for (i = slot; i < nritems; i++) {
3613 u32 ioff; 3589 u32 ioff;
3614 3590
3615 item = btrfs_item_nr(leaf, i); 3591 item = btrfs_item_nr(leaf, i);
3616 if (!leaf->map_token) {
3617 map_extent_buffer(leaf, (unsigned long)item,
3618 sizeof(struct btrfs_item),
3619 &leaf->map_token, &leaf->kaddr,
3620 &leaf->map_start, &leaf->map_len,
3621 KM_USER1);
3622 }
3623
3624 ioff = btrfs_item_offset(leaf, item); 3592 ioff = btrfs_item_offset(leaf, item);
3625 btrfs_set_item_offset(leaf, item, ioff - total_data); 3593 btrfs_set_item_offset(leaf, item, ioff - total_data);
3626 } 3594 }
3627 if (leaf->map_token) {
3628 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3629 leaf->map_token = NULL;
3630 }
3631
3632 /* shift the items */ 3595 /* shift the items */
3633 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), 3596 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3634 btrfs_item_nr_offset(slot), 3597 btrfs_item_nr_offset(slot),
@@ -3840,22 +3803,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3840 u32 ioff; 3803 u32 ioff;
3841 3804
3842 item = btrfs_item_nr(leaf, i); 3805 item = btrfs_item_nr(leaf, i);
3843 if (!leaf->map_token) {
3844 map_extent_buffer(leaf, (unsigned long)item,
3845 sizeof(struct btrfs_item),
3846 &leaf->map_token, &leaf->kaddr,
3847 &leaf->map_start, &leaf->map_len,
3848 KM_USER1);
3849 }
3850 ioff = btrfs_item_offset(leaf, item); 3806 ioff = btrfs_item_offset(leaf, item);
3851 btrfs_set_item_offset(leaf, item, ioff + dsize); 3807 btrfs_set_item_offset(leaf, item, ioff + dsize);
3852 } 3808 }
3853 3809
3854 if (leaf->map_token) {
3855 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3856 leaf->map_token = NULL;
3857 }
3858
3859 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), 3810 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3860 btrfs_item_nr_offset(slot + nr), 3811 btrfs_item_nr_offset(slot + nr),
3861 sizeof(struct btrfs_item) * 3812 sizeof(struct btrfs_item) *
@@ -4004,11 +3955,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4004 3955
4005 WARN_ON(!path->keep_locks); 3956 WARN_ON(!path->keep_locks);
4006again: 3957again:
4007 cur = btrfs_lock_root_node(root); 3958 cur = btrfs_read_lock_root_node(root);
4008 level = btrfs_header_level(cur); 3959 level = btrfs_header_level(cur);
4009 WARN_ON(path->nodes[level]); 3960 WARN_ON(path->nodes[level]);
4010 path->nodes[level] = cur; 3961 path->nodes[level] = cur;
4011 path->locks[level] = 1; 3962 path->locks[level] = BTRFS_READ_LOCK;
4012 3963
4013 if (btrfs_header_generation(cur) < min_trans) { 3964 if (btrfs_header_generation(cur) < min_trans) {
4014 ret = 1; 3965 ret = 1;
@@ -4098,12 +4049,12 @@ find_next_key:
4098 cur = read_node_slot(root, cur, slot); 4049 cur = read_node_slot(root, cur, slot);
4099 BUG_ON(!cur); 4050 BUG_ON(!cur);
4100 4051
4101 btrfs_tree_lock(cur); 4052 btrfs_tree_read_lock(cur);
4102 4053
4103 path->locks[level - 1] = 1; 4054 path->locks[level - 1] = BTRFS_READ_LOCK;
4104 path->nodes[level - 1] = cur; 4055 path->nodes[level - 1] = cur;
4105 unlock_up(path, level, 1); 4056 unlock_up(path, level, 1);
4106 btrfs_clear_path_blocking(path, NULL); 4057 btrfs_clear_path_blocking(path, NULL, 0);
4107 } 4058 }
4108out: 4059out:
4109 if (ret == 0) 4060 if (ret == 0)
@@ -4218,30 +4169,21 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4218 u32 nritems; 4169 u32 nritems;
4219 int ret; 4170 int ret;
4220 int old_spinning = path->leave_spinning; 4171 int old_spinning = path->leave_spinning;
4221 int force_blocking = 0; 4172 int next_rw_lock = 0;
4222 4173
4223 nritems = btrfs_header_nritems(path->nodes[0]); 4174 nritems = btrfs_header_nritems(path->nodes[0]);
4224 if (nritems == 0) 4175 if (nritems == 0)
4225 return 1; 4176 return 1;
4226 4177
4227 /*
4228 * we take the blocks in an order that upsets lockdep. Using
4229 * blocking mode is the only way around it.
4230 */
4231#ifdef CONFIG_DEBUG_LOCK_ALLOC
4232 force_blocking = 1;
4233#endif
4234
4235 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4178 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4236again: 4179again:
4237 level = 1; 4180 level = 1;
4238 next = NULL; 4181 next = NULL;
4182 next_rw_lock = 0;
4239 btrfs_release_path(path); 4183 btrfs_release_path(path);
4240 4184
4241 path->keep_locks = 1; 4185 path->keep_locks = 1;
4242 4186 path->leave_spinning = 1;
4243 if (!force_blocking)
4244 path->leave_spinning = 1;
4245 4187
4246 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4247 path->keep_locks = 0; 4189 path->keep_locks = 0;
@@ -4281,11 +4223,12 @@ again:
4281 } 4223 }
4282 4224
4283 if (next) { 4225 if (next) {
4284 btrfs_tree_unlock(next); 4226 btrfs_tree_unlock_rw(next, next_rw_lock);
4285 free_extent_buffer(next); 4227 free_extent_buffer(next);
4286 } 4228 }
4287 4229
4288 next = c; 4230 next = c;
4231 next_rw_lock = path->locks[level];
4289 ret = read_block_for_search(NULL, root, path, &next, level, 4232 ret = read_block_for_search(NULL, root, path, &next, level,
4290 slot, &key); 4233 slot, &key);
4291 if (ret == -EAGAIN) 4234 if (ret == -EAGAIN)
@@ -4297,15 +4240,14 @@ again:
4297 } 4240 }
4298 4241
4299 if (!path->skip_locking) { 4242 if (!path->skip_locking) {
4300 ret = btrfs_try_spin_lock(next); 4243 ret = btrfs_try_tree_read_lock(next);
4301 if (!ret) { 4244 if (!ret) {
4302 btrfs_set_path_blocking(path); 4245 btrfs_set_path_blocking(path);
4303 btrfs_tree_lock(next); 4246 btrfs_tree_read_lock(next);
4304 if (!force_blocking) 4247 btrfs_clear_path_blocking(path, next,
4305 btrfs_clear_path_blocking(path, next); 4248 BTRFS_READ_LOCK);
4306 } 4249 }
4307 if (force_blocking) 4250 next_rw_lock = BTRFS_READ_LOCK;
4308 btrfs_set_lock_blocking(next);
4309 } 4251 }
4310 break; 4252 break;
4311 } 4253 }
@@ -4314,14 +4256,13 @@ again:
4314 level--; 4256 level--;
4315 c = path->nodes[level]; 4257 c = path->nodes[level];
4316 if (path->locks[level]) 4258 if (path->locks[level])
4317 btrfs_tree_unlock(c); 4259 btrfs_tree_unlock_rw(c, path->locks[level]);
4318 4260
4319 free_extent_buffer(c); 4261 free_extent_buffer(c);
4320 path->nodes[level] = next; 4262 path->nodes[level] = next;
4321 path->slots[level] = 0; 4263 path->slots[level] = 0;
4322 if (!path->skip_locking) 4264 if (!path->skip_locking)
4323 path->locks[level] = 1; 4265 path->locks[level] = next_rw_lock;
4324
4325 if (!level) 4266 if (!level)
4326 break; 4267 break;
4327 4268
@@ -4336,16 +4277,14 @@ again:
4336 } 4277 }
4337 4278
4338 if (!path->skip_locking) { 4279 if (!path->skip_locking) {
4339 btrfs_assert_tree_locked(path->nodes[level]); 4280 ret = btrfs_try_tree_read_lock(next);
4340 ret = btrfs_try_spin_lock(next);
4341 if (!ret) { 4281 if (!ret) {
4342 btrfs_set_path_blocking(path); 4282 btrfs_set_path_blocking(path);
4343 btrfs_tree_lock(next); 4283 btrfs_tree_read_lock(next);
4344 if (!force_blocking) 4284 btrfs_clear_path_blocking(path, next,
4345 btrfs_clear_path_blocking(path, next); 4285 BTRFS_READ_LOCK);
4346 } 4286 }
4347 if (force_blocking) 4287 next_rw_lock = BTRFS_READ_LOCK;
4348 btrfs_set_lock_blocking(next);
4349 } 4288 }
4350 } 4289 }
4351 ret = 0; 4290 ret = 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fe9287b0649..365c4e1dde0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -755,6 +755,8 @@ struct btrfs_space_info {
755 chunks for this space */ 755 chunks for this space */
756 unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ 756 unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
757 757
758 unsigned int flush:1; /* set if we are trying to make space */
759
758 unsigned int force_alloc; /* set if we need to force a chunk 760 unsigned int force_alloc; /* set if we need to force a chunk
759 alloc for this space */ 761 alloc for this space */
760 762
@@ -764,7 +766,7 @@ struct btrfs_space_info {
764 struct list_head block_groups[BTRFS_NR_RAID_TYPES]; 766 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
765 spinlock_t lock; 767 spinlock_t lock;
766 struct rw_semaphore groups_sem; 768 struct rw_semaphore groups_sem;
767 atomic_t caching_threads; 769 wait_queue_head_t wait;
768}; 770};
769 771
770struct btrfs_block_rsv { 772struct btrfs_block_rsv {
@@ -824,6 +826,7 @@ struct btrfs_caching_control {
824 struct list_head list; 826 struct list_head list;
825 struct mutex mutex; 827 struct mutex mutex;
826 wait_queue_head_t wait; 828 wait_queue_head_t wait;
829 struct btrfs_work work;
827 struct btrfs_block_group_cache *block_group; 830 struct btrfs_block_group_cache *block_group;
828 u64 progress; 831 u64 progress;
829 atomic_t count; 832 atomic_t count;
@@ -1032,6 +1035,8 @@ struct btrfs_fs_info {
1032 struct btrfs_workers endio_write_workers; 1035 struct btrfs_workers endio_write_workers;
1033 struct btrfs_workers endio_freespace_worker; 1036 struct btrfs_workers endio_freespace_worker;
1034 struct btrfs_workers submit_workers; 1037 struct btrfs_workers submit_workers;
1038 struct btrfs_workers caching_workers;
1039
1035 /* 1040 /*
1036 * fixup workers take dirty pages that didn't properly go through 1041 * fixup workers take dirty pages that didn't properly go through
1037 * the cow mechanism and make them safe to write. It happens 1042 * the cow mechanism and make them safe to write. It happens
@@ -2128,7 +2133,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
2128 2133
2129/* extent-tree.c */ 2134/* extent-tree.c */
2130static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 2135static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2131 int num_items) 2136 unsigned num_items)
2132{ 2137{
2133 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 2138 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2134 3 * num_items; 2139 3 * num_items;
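
[As a worked example of the helper above, with sizes assumed for illustration rather than taken from this patch: with 4 KiB leaves and nodes and BTRFS_MAX_LEVEL of 8, reserving metadata for a single item costs (4096 + 4096 * 7) * 3 = 98304 bytes, i.e. 96 KiB per item. The only change in this hunk is that num_items is now unsigned.]
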
@@ -2222,9 +2227,6 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2222void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2227void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2223int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 2228int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2224void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 2229void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2225int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
2226 struct btrfs_root *root,
2227 int num_items);
2228void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 2230void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2229 struct btrfs_root *root); 2231 struct btrfs_root *root);
2230int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 2232int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2330,7 +2332,7 @@ struct btrfs_path *btrfs_alloc_path(void);
2330void btrfs_free_path(struct btrfs_path *p); 2332void btrfs_free_path(struct btrfs_path *p);
2331void btrfs_set_path_blocking(struct btrfs_path *p); 2333void btrfs_set_path_blocking(struct btrfs_path *p);
2332void btrfs_clear_path_blocking(struct btrfs_path *p, 2334void btrfs_clear_path_blocking(struct btrfs_path *p,
2333 struct extent_buffer *held); 2335 struct extent_buffer *held, int held_rw);
2334void btrfs_unlock_up_safe(struct btrfs_path *p, int level); 2336void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
2335 2337
2336int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2338int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 98c68e658a9..b52c672f4c1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -735,7 +735,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
735 } 735 }
736 736
737 /* reset all the locked nodes in the patch to spinning locks. */ 737 /* reset all the locked nodes in the patch to spinning locks. */
738 btrfs_clear_path_blocking(path, NULL); 738 btrfs_clear_path_blocking(path, NULL, 0);
739 739
740 /* insert the keys of the items */ 740 /* insert the keys of the items */
741 ret = setup_items_for_insert(trans, root, path, keys, data_size, 741 ret = setup_items_for_insert(trans, root, path, keys, data_size,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 685f2593c4f..c360a848d97 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -89,13 +89,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
89 data_size = sizeof(*dir_item) + name_len + data_len; 89 data_size = sizeof(*dir_item) + name_len + data_len;
90 dir_item = insert_with_overflow(trans, root, path, &key, data_size, 90 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
91 name, name_len); 91 name, name_len);
92 /* 92 if (IS_ERR(dir_item))
93 * FIXME: at some point we should handle xattr's that are larger than 93 return PTR_ERR(dir_item);
94 * what we can fit in our leaf. We set location to NULL b/c we arent
95 * pointing at anything else, that will change if we store the xattr
96 * data in a separate inode.
97 */
98 BUG_ON(IS_ERR(dir_item));
99 memset(&location, 0, sizeof(location)); 94 memset(&location, 0, sizeof(location));
100 95
101 leaf = path->nodes[0]; 96 leaf = path->nodes[0];
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b231ae13b26..07b3ac662e1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -100,38 +100,83 @@ struct async_submit_bio {
100 struct btrfs_work work; 100 struct btrfs_work work;
101}; 101};
102 102
103/* These are used to set the lockdep class on the extent buffer locks. 103/*
104 * The class is set by the readpage_end_io_hook after the buffer has 104 * Lockdep class keys for extent_buffer->lock's in this root. For a given
105 * passed csum validation but before the pages are unlocked. 105 * eb, the lockdep key is determined by the btrfs_root it belongs to and
106 * the level the eb occupies in the tree.
107 *
108 * Different roots are used for different purposes and may nest inside each
109 * other and they require separate keysets. As lockdep keys should be
110 * static, assign keysets according to the purpose of the root as indicated
111 * by btrfs_root->objectid. This ensures that all special purpose roots
112 * have separate keysets.
106 * 113 *
107 * The lockdep class is also set by btrfs_init_new_buffer on freshly 114 * Lock-nesting across peer nodes is always done with the immediate parent
108 * allocated blocks. 115 * node locked thus preventing deadlock. As lockdep doesn't know this, use
116 * subclass to avoid triggering lockdep warning in such cases.
109 * 117 *
110 * The class is based on the level in the tree block, which allows lockdep 118 * The key is set by the readpage_end_io_hook after the buffer has passed
111 * to know that lower nodes nest inside the locks of higher nodes. 119 * csum validation but before the pages are unlocked. It is also set by
120 * btrfs_init_new_buffer on freshly allocated blocks.
112 * 121 *
113 * We also add a check to make sure the highest level of the tree is 122 * We also add a check to make sure the highest level of the tree is the
114 * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this 123 * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
115 * code needs update as well. 124 * needs update as well.
116 */ 125 */
117#ifdef CONFIG_DEBUG_LOCK_ALLOC 126#ifdef CONFIG_DEBUG_LOCK_ALLOC
118# if BTRFS_MAX_LEVEL != 8 127# if BTRFS_MAX_LEVEL != 8
119# error 128# error
120# endif 129# endif
121static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; 130
122static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { 131static struct btrfs_lockdep_keyset {
123 /* leaf */ 132 u64 id; /* root objectid */
124 "btrfs-extent-00", 133 const char *name_stem; /* lock name stem */
125 "btrfs-extent-01", 134 char names[BTRFS_MAX_LEVEL + 1][20];
126 "btrfs-extent-02", 135 struct lock_class_key keys[BTRFS_MAX_LEVEL + 1];
127 "btrfs-extent-03", 136} btrfs_lockdep_keysets[] = {
128 "btrfs-extent-04", 137 { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" },
129 "btrfs-extent-05", 138 { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" },
130 "btrfs-extent-06", 139 { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" },
131 "btrfs-extent-07", 140 { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
132 /* highest possible level */ 141 { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
133 "btrfs-extent-08", 142 { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
143 { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" },
144 { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
145 { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
146 { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
147 { .id = 0, .name_stem = "tree" },
134}; 148};
149
150void __init btrfs_init_lockdep(void)
151{
152 int i, j;
153
154 /* initialize lockdep class names */
155 for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
156 struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
157
158 for (j = 0; j < ARRAY_SIZE(ks->names); j++)
159 snprintf(ks->names[j], sizeof(ks->names[j]),
160 "btrfs-%s-%02d", ks->name_stem, j);
161 }
162}
163
164void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
165 int level)
166{
167 struct btrfs_lockdep_keyset *ks;
168
169 BUG_ON(level >= ARRAY_SIZE(ks->keys));
170
171 /* find the matching keyset, id 0 is the default entry */
172 for (ks = btrfs_lockdep_keysets; ks->id; ks++)
173 if (ks->id == objectid)
174 break;
175
176 lockdep_set_class_and_name(&eb->lock,
177 &ks->keys[level], ks->names[level]);
178}
179
135#endif 180#endif
136 181
137/* 182/*
@@ -217,7 +262,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
217 unsigned long len; 262 unsigned long len;
218 unsigned long cur_len; 263 unsigned long cur_len;
219 unsigned long offset = BTRFS_CSUM_SIZE; 264 unsigned long offset = BTRFS_CSUM_SIZE;
220 char *map_token = NULL;
221 char *kaddr; 265 char *kaddr;
222 unsigned long map_start; 266 unsigned long map_start;
223 unsigned long map_len; 267 unsigned long map_len;
@@ -228,8 +272,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
228 len = buf->len - offset; 272 len = buf->len - offset;
229 while (len > 0) { 273 while (len > 0) {
230 err = map_private_extent_buffer(buf, offset, 32, 274 err = map_private_extent_buffer(buf, offset, 32,
231 &map_token, &kaddr, 275 &kaddr, &map_start, &map_len);
232 &map_start, &map_len, KM_USER0);
233 if (err) 276 if (err)
234 return 1; 277 return 1;
235 cur_len = min(len, map_len - (offset - map_start)); 278 cur_len = min(len, map_len - (offset - map_start));
@@ -237,7 +280,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
237 crc, cur_len); 280 crc, cur_len);
238 len -= cur_len; 281 len -= cur_len;
239 offset += cur_len; 282 offset += cur_len;
240 unmap_extent_buffer(buf, map_token, KM_USER0);
241 } 283 }
242 if (csum_size > sizeof(inline_result)) { 284 if (csum_size > sizeof(inline_result)) {
243 result = kzalloc(csum_size * sizeof(char), GFP_NOFS); 285 result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
@@ -494,15 +536,6 @@ static noinline int check_leaf(struct btrfs_root *root,
494 return 0; 536 return 0;
495} 537}
496 538
497#ifdef CONFIG_DEBUG_LOCK_ALLOC
498void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
499{
500 lockdep_set_class_and_name(&eb->lock,
501 &btrfs_eb_class[level],
502 btrfs_eb_name[level]);
503}
504#endif
505
506static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, 539static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
507 struct extent_state *state) 540 struct extent_state *state)
508{ 541{
@@ -553,7 +586,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
553 } 586 }
554 found_level = btrfs_header_level(eb); 587 found_level = btrfs_header_level(eb);
555 588
556 btrfs_set_buffer_lockdep_class(eb, found_level); 589 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
590 eb, found_level);
557 591
558 ret = csum_tree_block(root, eb, 1); 592 ret = csum_tree_block(root, eb, 1);
559 if (ret) { 593 if (ret) {
@@ -1598,7 +1632,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1598 goto fail_bdi; 1632 goto fail_bdi;
1599 } 1633 }
1600 1634
1601 fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS; 1635 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
1602 1636
1603 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 1637 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1604 INIT_LIST_HEAD(&fs_info->trans_list); 1638 INIT_LIST_HEAD(&fs_info->trans_list);
@@ -1802,6 +1836,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1802 fs_info->thread_pool_size), 1836 fs_info->thread_pool_size),
1803 &fs_info->generic_worker); 1837 &fs_info->generic_worker);
1804 1838
1839 btrfs_init_workers(&fs_info->caching_workers, "cache",
1840 2, &fs_info->generic_worker);
1841
1805 /* a higher idle thresh on the submit workers makes it much more 1842 /* a higher idle thresh on the submit workers makes it much more
1806 * likely that bios will be send down in a sane order to the 1843 * likely that bios will be send down in a sane order to the
1807 * devices 1844 * devices
@@ -1855,6 +1892,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1855 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1892 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1856 btrfs_start_workers(&fs_info->endio_freespace_worker, 1); 1893 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1857 btrfs_start_workers(&fs_info->delayed_workers, 1); 1894 btrfs_start_workers(&fs_info->delayed_workers, 1);
1895 btrfs_start_workers(&fs_info->caching_workers, 1);
1858 1896
1859 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1897 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1860 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1898 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2112,6 +2150,7 @@ fail_sb_buffer:
2112 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2150 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2113 btrfs_stop_workers(&fs_info->submit_workers); 2151 btrfs_stop_workers(&fs_info->submit_workers);
2114 btrfs_stop_workers(&fs_info->delayed_workers); 2152 btrfs_stop_workers(&fs_info->delayed_workers);
2153 btrfs_stop_workers(&fs_info->caching_workers);
2115fail_alloc: 2154fail_alloc:
2116 kfree(fs_info->delayed_root); 2155 kfree(fs_info->delayed_root);
2117fail_iput: 2156fail_iput:
@@ -2577,6 +2616,7 @@ int close_ctree(struct btrfs_root *root)
2577 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2616 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2578 btrfs_stop_workers(&fs_info->submit_workers); 2617 btrfs_stop_workers(&fs_info->submit_workers);
2579 btrfs_stop_workers(&fs_info->delayed_workers); 2618 btrfs_stop_workers(&fs_info->delayed_workers);
2619 btrfs_stop_workers(&fs_info->caching_workers);
2580 2620
2581 btrfs_close_devices(fs_info->fs_devices); 2621 btrfs_close_devices(fs_info->fs_devices);
2582 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2622 btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index a0b610a67aa..bec3ea4bd67 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,10 +87,14 @@ int btree_lock_page_hook(struct page *page);
87 87
88 88
89#ifdef CONFIG_DEBUG_LOCK_ALLOC 89#ifdef CONFIG_DEBUG_LOCK_ALLOC
90void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); 90void btrfs_init_lockdep(void);
91void btrfs_set_buffer_lockdep_class(u64 objectid,
92 struct extent_buffer *eb, int level);
91#else 93#else
92static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, 94static inline void btrfs_init_lockdep(void)
93 int level) 95{ }
96static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
97 struct extent_buffer *eb, int level)
94{ 98{
95} 99}
96#endif 100#endif
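
The disk-io.h change follows the usual pattern for CONFIG-gated helpers: real prototypes under CONFIG_DEBUG_LOCK_ALLOC, empty static inline stubs otherwise, so call sites never need their own #ifdef. A minimal illustration of that idiom with a made-up FEATURE_TRACE switch (not a real kernel config symbol):

#include <stdio.h>

/* flip this to 0 and the calls below compile away to nothing */
#define FEATURE_TRACE 1

#if FEATURE_TRACE
static void trace_init(void)
{
	printf("tracing enabled\n");
}
static void trace_event(const char *what, int value)
{
	printf("event %s = %d\n", what, value);
}
#else
/* stubs keep every call site ifdef-free */
static inline void trace_init(void) { }
static inline void trace_event(const char *what, int value) { }
#endif

int main(void)
{
	trace_init();
	trace_event("reserved_bytes", 4096);
	return 0;
}
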
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 71cd456fdb6..4d08ed79405 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -320,12 +320,12 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
320 return total_added; 320 return total_added;
321} 321}
322 322
323static int caching_kthread(void *data) 323static noinline void caching_thread(struct btrfs_work *work)
324{ 324{
325 struct btrfs_block_group_cache *block_group = data; 325 struct btrfs_block_group_cache *block_group;
326 struct btrfs_fs_info *fs_info = block_group->fs_info; 326 struct btrfs_fs_info *fs_info;
327 struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; 327 struct btrfs_caching_control *caching_ctl;
328 struct btrfs_root *extent_root = fs_info->extent_root; 328 struct btrfs_root *extent_root;
329 struct btrfs_path *path; 329 struct btrfs_path *path;
330 struct extent_buffer *leaf; 330 struct extent_buffer *leaf;
331 struct btrfs_key key; 331 struct btrfs_key key;
@@ -334,9 +334,14 @@ static int caching_kthread(void *data)
334 u32 nritems; 334 u32 nritems;
335 int ret = 0; 335 int ret = 0;
336 336
337 caching_ctl = container_of(work, struct btrfs_caching_control, work);
338 block_group = caching_ctl->block_group;
339 fs_info = block_group->fs_info;
340 extent_root = fs_info->extent_root;
341
337 path = btrfs_alloc_path(); 342 path = btrfs_alloc_path();
338 if (!path) 343 if (!path)
339 return -ENOMEM; 344 goto out;
340 345
341 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 346 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
342 347
@@ -433,13 +438,11 @@ err:
433 free_excluded_extents(extent_root, block_group); 438 free_excluded_extents(extent_root, block_group);
434 439
435 mutex_unlock(&caching_ctl->mutex); 440 mutex_unlock(&caching_ctl->mutex);
441out:
436 wake_up(&caching_ctl->wait); 442 wake_up(&caching_ctl->wait);
437 443
438 put_caching_control(caching_ctl); 444 put_caching_control(caching_ctl);
439 atomic_dec(&block_group->space_info->caching_threads);
440 btrfs_put_block_group(block_group); 445 btrfs_put_block_group(block_group);
441
442 return 0;
443} 446}
444 447
445static int cache_block_group(struct btrfs_block_group_cache *cache, 448static int cache_block_group(struct btrfs_block_group_cache *cache,
@@ -449,7 +452,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
449{ 452{
450 struct btrfs_fs_info *fs_info = cache->fs_info; 453 struct btrfs_fs_info *fs_info = cache->fs_info;
451 struct btrfs_caching_control *caching_ctl; 454 struct btrfs_caching_control *caching_ctl;
452 struct task_struct *tsk;
453 int ret = 0; 455 int ret = 0;
454 456
455 smp_mb(); 457 smp_mb();
@@ -501,6 +503,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
501 caching_ctl->progress = cache->key.objectid; 503 caching_ctl->progress = cache->key.objectid;
502 /* one for caching kthread, one for caching block group list */ 504 /* one for caching kthread, one for caching block group list */
503 atomic_set(&caching_ctl->count, 2); 505 atomic_set(&caching_ctl->count, 2);
506 caching_ctl->work.func = caching_thread;
504 507
505 spin_lock(&cache->lock); 508 spin_lock(&cache->lock);
506 if (cache->cached != BTRFS_CACHE_NO) { 509 if (cache->cached != BTRFS_CACHE_NO) {
@@ -516,16 +519,9 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
516 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 519 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
517 up_write(&fs_info->extent_commit_sem); 520 up_write(&fs_info->extent_commit_sem);
518 521
519 atomic_inc(&cache->space_info->caching_threads);
520 btrfs_get_block_group(cache); 522 btrfs_get_block_group(cache);
521 523
522 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", 524 btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
523 cache->key.objectid);
524 if (IS_ERR(tsk)) {
525 ret = PTR_ERR(tsk);
526 printk(KERN_ERR "error running thread %d\n", ret);
527 BUG();
528 }
529 525
530 return ret; 526 return ret;
531} 527}
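
caching_kthread() becomes caching_thread(), a btrfs_work callback: the block group context that used to arrive as the kthread's data pointer is now recovered with container_of() from the embedded work item, and cache_block_group() only has to set work.func and queue it on the new caching_workers pool. A userspace sketch of that embed-and-recover pattern, with a hypothetical run_work() standing in for btrfs_queue_worker():

#include <stddef.h>
#include <stdio.h>

struct work {
	void (*func)(struct work *work);
};

/* kernel-style container_of: recover the outer struct from the member */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* plays the role of btrfs_caching_control */
struct caching_control {
	const char *block_group_name;
	unsigned long long progress;
	struct work work;       /* embedded work item */
};

/* plays the role of caching_thread(): no data pointer, only the work item */
static void caching_thread(struct work *work)
{
	struct caching_control *ctl =
		container_of(work, struct caching_control, work);

	ctl->progress += 4096;
	printf("cached %s up to %llu\n",
	       ctl->block_group_name, ctl->progress);
}

/* stand-in for btrfs_queue_worker(): a real pool would run this on a thread */
static void run_work(struct work *work)
{
	work->func(work);
}

int main(void)
{
	struct caching_control ctl = {
		.block_group_name = "bg-12345",
		.progress = 0,
		.work = { .func = caching_thread },
	};

	run_work(&ctl.work);
	return 0;
}
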
@@ -2932,9 +2928,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2932 found->full = 0; 2928 found->full = 0;
2933 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 2929 found->force_alloc = CHUNK_ALLOC_NO_FORCE;
2934 found->chunk_alloc = 0; 2930 found->chunk_alloc = 0;
2931 found->flush = 0;
2932 init_waitqueue_head(&found->wait);
2935 *space_info = found; 2933 *space_info = found;
2936 list_add_rcu(&found->list, &info->space_info); 2934 list_add_rcu(&found->list, &info->space_info);
2937 atomic_set(&found->caching_threads, 0);
2938 return 0; 2935 return 0;
2939} 2936}
2940 2937
@@ -3314,6 +3311,14 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3314 if (reserved == 0) 3311 if (reserved == 0)
3315 return 0; 3312 return 0;
3316 3313
3314 smp_mb();
3315 if (root->fs_info->delalloc_bytes == 0) {
3316 if (trans)
3317 return 0;
3318 btrfs_wait_ordered_extents(root, 0, 0);
3319 return 0;
3320 }
3321
3317 max_reclaim = min(reserved, to_reclaim); 3322 max_reclaim = min(reserved, to_reclaim);
3318 3323
3319 while (loops < 1024) { 3324 while (loops < 1024) {
@@ -3356,6 +3361,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3356 } 3361 }
3357 3362
3358 } 3363 }
3364 if (reclaimed >= to_reclaim && !trans)
3365 btrfs_wait_ordered_extents(root, 0, 0);
3359 return reclaimed >= to_reclaim; 3366 return reclaimed >= to_reclaim;
3360} 3367}
3361 3368
@@ -3380,15 +3387,36 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
3380 u64 num_bytes = orig_bytes; 3387 u64 num_bytes = orig_bytes;
3381 int retries = 0; 3388 int retries = 0;
3382 int ret = 0; 3389 int ret = 0;
3383 bool reserved = false;
3384 bool committed = false; 3390 bool committed = false;
3391 bool flushing = false;
3385 3392
3386again: 3393again:
3387 ret = -ENOSPC; 3394 ret = 0;
3388 if (reserved)
3389 num_bytes = 0;
3390
3391 spin_lock(&space_info->lock); 3395 spin_lock(&space_info->lock);
3396 /*
3397 * We only want to wait if somebody other than us is flushing and we are
 3398 * actually allowed to flush.
3399 */
3400 while (flush && !flushing && space_info->flush) {
3401 spin_unlock(&space_info->lock);
3402 /*
3403 * If we have a trans handle we can't wait because the flusher
3404 * may have to commit the transaction, which would mean we would
3405 * deadlock since we are waiting for the flusher to finish, but
3406 * hold the current transaction open.
3407 */
3408 if (trans)
3409 return -EAGAIN;
3410 ret = wait_event_interruptible(space_info->wait,
3411 !space_info->flush);
3412 /* Must have been interrupted, return */
3413 if (ret)
3414 return -EINTR;
3415
3416 spin_lock(&space_info->lock);
3417 }
3418
3419 ret = -ENOSPC;
3392 unused = space_info->bytes_used + space_info->bytes_reserved + 3420 unused = space_info->bytes_used + space_info->bytes_reserved +
3393 space_info->bytes_pinned + space_info->bytes_readonly + 3421 space_info->bytes_pinned + space_info->bytes_readonly +
3394 space_info->bytes_may_use; 3422 space_info->bytes_may_use;
@@ -3403,8 +3431,7 @@ again:
3403 if (unused <= space_info->total_bytes) { 3431 if (unused <= space_info->total_bytes) {
3404 unused = space_info->total_bytes - unused; 3432 unused = space_info->total_bytes - unused;
3405 if (unused >= num_bytes) { 3433 if (unused >= num_bytes) {
3406 if (!reserved) 3434 space_info->bytes_reserved += orig_bytes;
3407 space_info->bytes_reserved += orig_bytes;
3408 ret = 0; 3435 ret = 0;
3409 } else { 3436 } else {
3410 /* 3437 /*
@@ -3429,17 +3456,14 @@ again:
3429 * to reclaim space we can actually use it instead of somebody else 3456 * to reclaim space we can actually use it instead of somebody else
3430 * stealing it from us. 3457 * stealing it from us.
3431 */ 3458 */
3432 if (ret && !reserved) { 3459 if (ret && flush) {
3433 space_info->bytes_reserved += orig_bytes; 3460 flushing = true;
3434 reserved = true; 3461 space_info->flush = 1;
3435 } 3462 }
3436 3463
3437 spin_unlock(&space_info->lock); 3464 spin_unlock(&space_info->lock);
3438 3465
3439 if (!ret) 3466 if (!ret || !flush)
3440 return 0;
3441
3442 if (!flush)
3443 goto out; 3467 goto out;
3444 3468
3445 /* 3469 /*
@@ -3447,11 +3471,11 @@ again:
3447 * metadata until after the IO is completed. 3471 * metadata until after the IO is completed.
3448 */ 3472 */
3449 ret = shrink_delalloc(trans, root, num_bytes, 1); 3473 ret = shrink_delalloc(trans, root, num_bytes, 1);
3450 if (ret > 0) 3474 if (ret < 0)
3451 return 0;
3452 else if (ret < 0)
3453 goto out; 3475 goto out;
3454 3476
3477 ret = 0;
3478
3455 /* 3479 /*
3456 * So if we were overcommitted it's possible that somebody else flushed 3480 * So if we were overcommitted it's possible that somebody else flushed
3457 * out enough space and we simply didn't have enough space to reclaim, 3481 * out enough space and we simply didn't have enough space to reclaim,
@@ -3462,11 +3486,11 @@ again:
3462 goto again; 3486 goto again;
3463 } 3487 }
3464 3488
3465 spin_lock(&space_info->lock);
3466 /* 3489 /*
3467 * Not enough space to be reclaimed, don't bother committing the 3490 * Not enough space to be reclaimed, don't bother committing the
3468 * transaction. 3491 * transaction.
3469 */ 3492 */
3493 spin_lock(&space_info->lock);
3470 if (space_info->bytes_pinned < orig_bytes) 3494 if (space_info->bytes_pinned < orig_bytes)
3471 ret = -ENOSPC; 3495 ret = -ENOSPC;
3472 spin_unlock(&space_info->lock); 3496 spin_unlock(&space_info->lock);
@@ -3474,10 +3498,13 @@ again:
3474 goto out; 3498 goto out;
3475 3499
3476 ret = -EAGAIN; 3500 ret = -EAGAIN;
3477 if (trans || committed) 3501 if (trans)
3478 goto out; 3502 goto out;
3479 3503
3480 ret = -ENOSPC; 3504 ret = -ENOSPC;
3505 if (committed)
3506 goto out;
3507
3481 trans = btrfs_join_transaction(root); 3508 trans = btrfs_join_transaction(root);
3482 if (IS_ERR(trans)) 3509 if (IS_ERR(trans))
3483 goto out; 3510 goto out;
@@ -3489,12 +3516,12 @@ again:
3489 } 3516 }
3490 3517
3491out: 3518out:
3492 if (reserved) { 3519 if (flushing) {
3493 spin_lock(&space_info->lock); 3520 spin_lock(&space_info->lock);
3494 space_info->bytes_reserved -= orig_bytes; 3521 space_info->flush = 0;
3522 wake_up_all(&space_info->wait);
3495 spin_unlock(&space_info->lock); 3523 spin_unlock(&space_info->lock);
3496 } 3524 }
3497
3498 return ret; 3525 return ret;
3499} 3526}
3500 3527
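
reserve_metadata_bytes() now serializes flushers through space_info->flush and the new wait queue: a caller that is allowed to flush but finds someone else already flushing waits (or returns -EAGAIN if it holds a transaction handle, since the flusher may need to commit), and the task that set the flag clears it and wakes everyone in the out: path. A loose userspace analogue of that single-flusher handoff, using a pthread mutex and condition variable in place of the spinlock and wait queue:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t flush_done = PTHREAD_COND_INITIALIZER;
static bool flushing;          /* plays the role of space_info->flush */

/* stand-in for the actual reclaim work (shrink_delalloc, commit, ...) */
static void do_flush(int id)
{
	printf("task %d flushing\n", id);
	usleep(10000);
}

static int reserve(int id, bool have_trans)
{
	pthread_mutex_lock(&lock);
	while (flushing) {
		if (have_trans) {
			/* can't wait on the flusher while holding a transaction */
			pthread_mutex_unlock(&lock);
			return -EAGAIN;
		}
		pthread_cond_wait(&flush_done, &lock);
	}
	flushing = true;                       /* we are the flusher now */
	pthread_mutex_unlock(&lock);

	do_flush(id);

	pthread_mutex_lock(&lock);
	flushing = false;
	pthread_cond_broadcast(&flush_done);   /* wake_up_all(&space_info->wait) */
	pthread_mutex_unlock(&lock);
	return 0;
}

static void *worker(void *arg)
{
	int id = (int)(long)arg;
	int ret = reserve(id, id == 0 /* pretend task 0 holds a transaction */);
	printf("task %d -> %d\n", id, ret);
	return NULL;
}

int main(void)
{
	pthread_t t[4];
	for (long i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	return 0;
}
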
@@ -3704,7 +3731,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3704 if (commit_trans) { 3731 if (commit_trans) {
3705 if (trans) 3732 if (trans)
3706 return -EAGAIN; 3733 return -EAGAIN;
3707
3708 trans = btrfs_join_transaction(root); 3734 trans = btrfs_join_transaction(root);
3709 BUG_ON(IS_ERR(trans)); 3735 BUG_ON(IS_ERR(trans));
3710 ret = btrfs_commit_transaction(trans, root); 3736 ret = btrfs_commit_transaction(trans, root);
@@ -3874,26 +3900,6 @@ int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3874 return 0; 3900 return 0;
3875} 3901}
3876 3902
3877int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3878 struct btrfs_root *root,
3879 int num_items)
3880{
3881 u64 num_bytes;
3882 int ret;
3883
3884 if (num_items == 0 || root->fs_info->chunk_root == root)
3885 return 0;
3886
3887 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
3888 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3889 num_bytes);
3890 if (!ret) {
3891 trans->bytes_reserved += num_bytes;
3892 trans->block_rsv = &root->fs_info->trans_block_rsv;
3893 }
3894 return ret;
3895}
3896
3897void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 3903void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3898 struct btrfs_root *root) 3904 struct btrfs_root *root)
3899{ 3905{
@@ -3944,6 +3950,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3944 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 3950 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3945} 3951}
3946 3952
3953static unsigned drop_outstanding_extent(struct inode *inode)
3954{
3955 unsigned dropped_extents = 0;
3956
3957 spin_lock(&BTRFS_I(inode)->lock);
3958 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
3959 BTRFS_I(inode)->outstanding_extents--;
3960
3961 /*
 3962 * If we have at least as many outstanding extents as we have reserved,
 3963 * then we need to leave the reserved extents count alone.
3964 */
3965 if (BTRFS_I(inode)->outstanding_extents >=
3966 BTRFS_I(inode)->reserved_extents)
3967 goto out;
3968
3969 dropped_extents = BTRFS_I(inode)->reserved_extents -
3970 BTRFS_I(inode)->outstanding_extents;
3971 BTRFS_I(inode)->reserved_extents -= dropped_extents;
3972out:
3973 spin_unlock(&BTRFS_I(inode)->lock);
3974 return dropped_extents;
3975}
3976
3947static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 3977static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3948{ 3978{
3949 return num_bytes >>= 3; 3979 return num_bytes >>= 3;
@@ -3953,9 +3983,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3953{ 3983{
3954 struct btrfs_root *root = BTRFS_I(inode)->root; 3984 struct btrfs_root *root = BTRFS_I(inode)->root;
3955 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 3985 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3956 u64 to_reserve; 3986 u64 to_reserve = 0;
3957 int nr_extents; 3987 unsigned nr_extents = 0;
3958 int reserved_extents;
3959 int ret; 3988 int ret;
3960 3989
3961 if (btrfs_transaction_in_commit(root->fs_info)) 3990 if (btrfs_transaction_in_commit(root->fs_info))
@@ -3963,66 +3992,49 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3963 3992
3964 num_bytes = ALIGN(num_bytes, root->sectorsize); 3993 num_bytes = ALIGN(num_bytes, root->sectorsize);
3965 3994
3966 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; 3995 spin_lock(&BTRFS_I(inode)->lock);
3967 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); 3996 BTRFS_I(inode)->outstanding_extents++;
3997
3998 if (BTRFS_I(inode)->outstanding_extents >
3999 BTRFS_I(inode)->reserved_extents) {
4000 nr_extents = BTRFS_I(inode)->outstanding_extents -
4001 BTRFS_I(inode)->reserved_extents;
4002 BTRFS_I(inode)->reserved_extents += nr_extents;
3968 4003
3969 if (nr_extents > reserved_extents) {
3970 nr_extents -= reserved_extents;
3971 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4004 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
3972 } else {
3973 nr_extents = 0;
3974 to_reserve = 0;
3975 } 4005 }
4006 spin_unlock(&BTRFS_I(inode)->lock);
3976 4007
3977 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4008 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3978 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); 4009 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
3979 if (ret) 4010 if (ret) {
4011 unsigned dropped;
4012 /*
4013 * We don't need the return value since our reservation failed,
4014 * we just need to clean up our counter.
4015 */
4016 dropped = drop_outstanding_extent(inode);
4017 WARN_ON(dropped > 1);
3980 return ret; 4018 return ret;
3981 4019 }
3982 atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents);
3983 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3984 4020
3985 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4021 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3986 4022
3987 if (block_rsv->size > 512 * 1024 * 1024)
3988 shrink_delalloc(NULL, root, to_reserve, 0);
3989
3990 return 0; 4023 return 0;
3991} 4024}
3992 4025
3993void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4026void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3994{ 4027{
3995 struct btrfs_root *root = BTRFS_I(inode)->root; 4028 struct btrfs_root *root = BTRFS_I(inode)->root;
3996 u64 to_free; 4029 u64 to_free = 0;
3997 int nr_extents; 4030 unsigned dropped;
3998 int reserved_extents;
3999 4031
4000 num_bytes = ALIGN(num_bytes, root->sectorsize); 4032 num_bytes = ALIGN(num_bytes, root->sectorsize);
4001 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 4033 dropped = drop_outstanding_extent(inode);
4002 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
4003
4004 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
4005 do {
4006 int old, new;
4007
4008 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
4009 if (nr_extents >= reserved_extents) {
4010 nr_extents = 0;
4011 break;
4012 }
4013 old = reserved_extents;
4014 nr_extents = reserved_extents - nr_extents;
4015 new = reserved_extents - nr_extents;
4016 old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents,
4017 reserved_extents, new);
4018 if (likely(old == reserved_extents))
4019 break;
4020 reserved_extents = old;
4021 } while (1);
4022 4034
4023 to_free = calc_csum_metadata_size(inode, num_bytes); 4035 to_free = calc_csum_metadata_size(inode, num_bytes);
4024 if (nr_extents > 0) 4036 if (dropped > 0)
4025 to_free += btrfs_calc_trans_metadata_size(root, nr_extents); 4037 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4026 4038
4027 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4039 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4028 to_free); 4040 to_free);
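
With the per-inode counters turned into plain integers protected by BTRFS_I(inode)->lock, the reserve path bumps outstanding_extents and only reserves metadata for the delta above reserved_extents, while the release path goes through drop_outstanding_extent() to find out how much can be handed back. A small userspace model of that bookkeeping (byte accounting left out, a pthread mutex standing in for the spinlock):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

struct inode_ctr {
	pthread_mutex_t lock;          /* BTRFS_I(inode)->lock */
	unsigned outstanding_extents;
	unsigned reserved_extents;
};

/* reserve side: count the new extent, reserve only the uncovered delta */
static unsigned reserve_extent(struct inode_ctr *i)
{
	unsigned nr_extents = 0;

	pthread_mutex_lock(&i->lock);
	i->outstanding_extents++;
	if (i->outstanding_extents > i->reserved_extents) {
		nr_extents = i->outstanding_extents - i->reserved_extents;
		i->reserved_extents += nr_extents;
	}
	pthread_mutex_unlock(&i->lock);
	return nr_extents;               /* extents that need fresh metadata */
}

/* release side: the patch's drop_outstanding_extent() */
static unsigned drop_outstanding_extent(struct inode_ctr *i)
{
	unsigned dropped = 0;

	pthread_mutex_lock(&i->lock);
	assert(i->outstanding_extents);
	i->outstanding_extents--;
	if (i->outstanding_extents < i->reserved_extents) {
		dropped = i->reserved_extents - i->outstanding_extents;
		i->reserved_extents -= dropped;
	}
	pthread_mutex_unlock(&i->lock);
	return dropped;                  /* reservations that can be returned */
}

int main(void)
{
	struct inode_ctr ino = { .lock = PTHREAD_MUTEX_INITIALIZER };

	printf("reserve -> %u new\n", reserve_extent(&ino));            /* 1 */
	printf("reserve -> %u new\n", reserve_extent(&ino));            /* 1 */
	printf("release -> %u back\n", drop_outstanding_extent(&ino));  /* 1 */
	printf("release -> %u back\n", drop_outstanding_extent(&ino));  /* 1 */
	return 0;
}
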
@@ -4990,14 +5002,10 @@ have_block_group:
4990 } 5002 }
4991 5003
4992 /* 5004 /*
4993 * We only want to start kthread caching if we are at 5005 * The caching workers are limited to 2 threads, so we
4994 * the point where we will wait for caching to make 5006 * can queue as much work as we care to.
4995 * progress, or if our ideal search is over and we've
4996 * found somebody to start caching.
4997 */ 5007 */
4998 if (loop > LOOP_CACHING_NOWAIT || 5008 if (loop > LOOP_FIND_IDEAL) {
4999 (loop > LOOP_FIND_IDEAL &&
5000 atomic_read(&space_info->caching_threads) < 2)) {
5001 ret = cache_block_group(block_group, trans, 5009 ret = cache_block_group(block_group, trans,
5002 orig_root, 0); 5010 orig_root, 0);
5003 BUG_ON(ret); 5011 BUG_ON(ret);
@@ -5219,8 +5227,7 @@ loop:
5219 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 5227 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
5220 found_uncached_bg = false; 5228 found_uncached_bg = false;
5221 loop++; 5229 loop++;
5222 if (!ideal_cache_percent && 5230 if (!ideal_cache_percent)
5223 atomic_read(&space_info->caching_threads))
5224 goto search; 5231 goto search;
5225 5232
5226 /* 5233 /*
@@ -5623,7 +5630,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
5623 if (!buf) 5630 if (!buf)
5624 return ERR_PTR(-ENOMEM); 5631 return ERR_PTR(-ENOMEM);
5625 btrfs_set_header_generation(buf, trans->transid); 5632 btrfs_set_header_generation(buf, trans->transid);
5626 btrfs_set_buffer_lockdep_class(buf, level); 5633 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
5627 btrfs_tree_lock(buf); 5634 btrfs_tree_lock(buf);
5628 clean_tree_block(trans, root, buf); 5635 clean_tree_block(trans, root, buf);
5629 5636
@@ -5910,7 +5917,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5910 return 1; 5917 return 1;
5911 5918
5912 if (path->locks[level] && !wc->keep_locks) { 5919 if (path->locks[level] && !wc->keep_locks) {
5913 btrfs_tree_unlock(eb); 5920 btrfs_tree_unlock_rw(eb, path->locks[level]);
5914 path->locks[level] = 0; 5921 path->locks[level] = 0;
5915 } 5922 }
5916 return 0; 5923 return 0;
@@ -5934,7 +5941,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5934 * keep the tree lock 5941 * keep the tree lock
5935 */ 5942 */
5936 if (path->locks[level] && level > 0) { 5943 if (path->locks[level] && level > 0) {
5937 btrfs_tree_unlock(eb); 5944 btrfs_tree_unlock_rw(eb, path->locks[level]);
5938 path->locks[level] = 0; 5945 path->locks[level] = 0;
5939 } 5946 }
5940 return 0; 5947 return 0;
@@ -6047,7 +6054,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6047 BUG_ON(level != btrfs_header_level(next)); 6054 BUG_ON(level != btrfs_header_level(next));
6048 path->nodes[level] = next; 6055 path->nodes[level] = next;
6049 path->slots[level] = 0; 6056 path->slots[level] = 0;
6050 path->locks[level] = 1; 6057 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6051 wc->level = level; 6058 wc->level = level;
6052 if (wc->level == 1) 6059 if (wc->level == 1)
6053 wc->reada_slot = 0; 6060 wc->reada_slot = 0;
@@ -6118,7 +6125,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6118 BUG_ON(level == 0); 6125 BUG_ON(level == 0);
6119 btrfs_tree_lock(eb); 6126 btrfs_tree_lock(eb);
6120 btrfs_set_lock_blocking(eb); 6127 btrfs_set_lock_blocking(eb);
6121 path->locks[level] = 1; 6128 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6122 6129
6123 ret = btrfs_lookup_extent_info(trans, root, 6130 ret = btrfs_lookup_extent_info(trans, root,
6124 eb->start, eb->len, 6131 eb->start, eb->len,
@@ -6127,8 +6134,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6127 BUG_ON(ret); 6134 BUG_ON(ret);
6128 BUG_ON(wc->refs[level] == 0); 6135 BUG_ON(wc->refs[level] == 0);
6129 if (wc->refs[level] == 1) { 6136 if (wc->refs[level] == 1) {
6130 btrfs_tree_unlock(eb); 6137 btrfs_tree_unlock_rw(eb, path->locks[level]);
6131 path->locks[level] = 0;
6132 return 1; 6138 return 1;
6133 } 6139 }
6134 } 6140 }
@@ -6150,7 +6156,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6150 btrfs_header_generation(eb) == trans->transid) { 6156 btrfs_header_generation(eb) == trans->transid) {
6151 btrfs_tree_lock(eb); 6157 btrfs_tree_lock(eb);
6152 btrfs_set_lock_blocking(eb); 6158 btrfs_set_lock_blocking(eb);
6153 path->locks[level] = 1; 6159 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6154 } 6160 }
6155 clean_tree_block(trans, root, eb); 6161 clean_tree_block(trans, root, eb);
6156 } 6162 }
@@ -6229,7 +6235,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6229 return 0; 6235 return 0;
6230 6236
6231 if (path->locks[level]) { 6237 if (path->locks[level]) {
6232 btrfs_tree_unlock(path->nodes[level]); 6238 btrfs_tree_unlock_rw(path->nodes[level],
6239 path->locks[level]);
6233 path->locks[level] = 0; 6240 path->locks[level] = 0;
6234 } 6241 }
6235 free_extent_buffer(path->nodes[level]); 6242 free_extent_buffer(path->nodes[level]);
@@ -6281,7 +6288,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6281 path->nodes[level] = btrfs_lock_root_node(root); 6288 path->nodes[level] = btrfs_lock_root_node(root);
6282 btrfs_set_lock_blocking(path->nodes[level]); 6289 btrfs_set_lock_blocking(path->nodes[level]);
6283 path->slots[level] = 0; 6290 path->slots[level] = 0;
6284 path->locks[level] = 1; 6291 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6285 memset(&wc->update_progress, 0, 6292 memset(&wc->update_progress, 0,
6286 sizeof(wc->update_progress)); 6293 sizeof(wc->update_progress));
6287 } else { 6294 } else {
@@ -6449,7 +6456,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6449 level = btrfs_header_level(node); 6456 level = btrfs_header_level(node);
6450 path->nodes[level] = node; 6457 path->nodes[level] = node;
6451 path->slots[level] = 0; 6458 path->slots[level] = 0;
6452 path->locks[level] = 1; 6459 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6453 6460
6454 wc->refs[parent_level] = 1; 6461 wc->refs[parent_level] = 1;
6455 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 6462 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -6524,15 +6531,28 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6524 return flags; 6531 return flags;
6525} 6532}
6526 6533
6527static int set_block_group_ro(struct btrfs_block_group_cache *cache) 6534static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6528{ 6535{
6529 struct btrfs_space_info *sinfo = cache->space_info; 6536 struct btrfs_space_info *sinfo = cache->space_info;
6530 u64 num_bytes; 6537 u64 num_bytes;
6538 u64 min_allocable_bytes;
6531 int ret = -ENOSPC; 6539 int ret = -ENOSPC;
6532 6540
6533 if (cache->ro) 6541 if (cache->ro)
6534 return 0; 6542 return 0;
6535 6543
6544 /*
 6545 * We need some metadata space and system metadata space for
 6546 * allocating chunks in some corner cases, so require a small reserve
 6547 * unless we are forced to mark the block group read-only.
6548 */
6549 if ((sinfo->flags &
6550 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
6551 !force)
6552 min_allocable_bytes = 1 * 1024 * 1024;
6553 else
6554 min_allocable_bytes = 0;
6555
6536 spin_lock(&sinfo->lock); 6556 spin_lock(&sinfo->lock);
6537 spin_lock(&cache->lock); 6557 spin_lock(&cache->lock);
6538 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 6558 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
@@ -6540,7 +6560,8 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
6540 6560
6541 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6561 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6542 sinfo->bytes_may_use + sinfo->bytes_readonly + 6562 sinfo->bytes_may_use + sinfo->bytes_readonly +
6543 cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { 6563 cache->reserved_pinned + num_bytes + min_allocable_bytes <=
6564 sinfo->total_bytes) {
6544 sinfo->bytes_readonly += num_bytes; 6565 sinfo->bytes_readonly += num_bytes;
6545 sinfo->bytes_reserved += cache->reserved_pinned; 6566 sinfo->bytes_reserved += cache->reserved_pinned;
6546 cache->reserved_pinned = 0; 6567 cache->reserved_pinned = 0;
@@ -6571,7 +6592,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
6571 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 6592 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6572 CHUNK_ALLOC_FORCE); 6593 CHUNK_ALLOC_FORCE);
6573 6594
6574 ret = set_block_group_ro(cache); 6595 ret = set_block_group_ro(cache, 0);
6575 if (!ret) 6596 if (!ret)
6576 goto out; 6597 goto out;
6577 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 6598 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
@@ -6579,7 +6600,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
6579 CHUNK_ALLOC_FORCE); 6600 CHUNK_ALLOC_FORCE);
6580 if (ret < 0) 6601 if (ret < 0)
6581 goto out; 6602 goto out;
6582 ret = set_block_group_ro(cache); 6603 ret = set_block_group_ro(cache, 0);
6583out: 6604out:
6584 btrfs_end_transaction(trans, root); 6605 btrfs_end_transaction(trans, root);
6585 return ret; 6606 return ret;
@@ -7016,7 +7037,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7016 7037
7017 set_avail_alloc_bits(root->fs_info, cache->flags); 7038 set_avail_alloc_bits(root->fs_info, cache->flags);
7018 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7039 if (btrfs_chunk_readonly(root, cache->key.objectid))
7019 set_block_group_ro(cache); 7040 set_block_group_ro(cache, 1);
7020 } 7041 }
7021 7042
7022 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { 7043 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -7030,9 +7051,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7030 * mirrored block groups. 7051 * mirrored block groups.
7031 */ 7052 */
7032 list_for_each_entry(cache, &space_info->block_groups[3], list) 7053 list_for_each_entry(cache, &space_info->block_groups[3], list)
7033 set_block_group_ro(cache); 7054 set_block_group_ro(cache, 1);
7034 list_for_each_entry(cache, &space_info->block_groups[4], list) 7055 list_for_each_entry(cache, &space_info->block_groups[4], list)
7035 set_block_group_ro(cache); 7056 set_block_group_ro(cache, 1);
7036 } 7057 }
7037 7058
7038 init_global_block_rsv(info); 7059 init_global_block_rsv(info);
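
set_block_group_ro() gains a force flag and a min_allocable_bytes floor: metadata and system block groups keep 1MB of headroom for chunk allocation unless the caller (read-only chunks at mount, the unused mirror lists) forces the transition. The decision is plain arithmetic over the space_info totals; a simplified standalone sketch with invented field names (it leaves out reserved_pinned and the per-group breakdown):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MIN_ALLOCABLE (1ULL * 1024 * 1024)   /* 1MB slack for meta/system */

struct space_totals {
	uint64_t total_bytes;
	uint64_t bytes_used;
	uint64_t bytes_reserved;
	uint64_t bytes_pinned;
	uint64_t bytes_readonly;
	uint64_t bytes_may_use;
};

/*
 * Can this block group's free bytes be moved into bytes_readonly while
 * still leaving min_allocable bytes of headroom in the space_info?
 */
static bool can_set_ro(const struct space_totals *s, uint64_t bg_free_bytes,
		       bool is_meta_or_system, bool force)
{
	uint64_t min_allocable = (is_meta_or_system && !force) ? MIN_ALLOCABLE : 0;
	uint64_t committed = s->bytes_used + s->bytes_reserved + s->bytes_pinned +
			     s->bytes_readonly + s->bytes_may_use;

	return committed + bg_free_bytes + min_allocable <= s->total_bytes;
}

int main(void)
{
	struct space_totals s = {
		.total_bytes = 8ULL << 20,      /* 8MB of metadata space */
		.bytes_used = 5ULL << 20,
		.bytes_reserved = 1ULL << 20,
	};
	uint64_t bg_free = 2ULL << 20;          /* free space in this block group */

	printf("normal: %d\n", can_set_ro(&s, bg_free, true, false)); /* 0: 1MB slack missing */
	printf("forced: %d\n", can_set_ro(&s, bg_free, true, true));  /* 1 */
	return 0;
}
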
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 561262d3568..067b1747421 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -281,11 +281,10 @@ static int merge_state(struct extent_io_tree *tree,
281 if (other->start == state->end + 1 && 281 if (other->start == state->end + 1 &&
282 other->state == state->state) { 282 other->state == state->state) {
283 merge_cb(tree, state, other); 283 merge_cb(tree, state, other);
284 other->start = state->start; 284 state->end = other->end;
285 state->tree = NULL; 285 other->tree = NULL;
286 rb_erase(&state->rb_node, &tree->state); 286 rb_erase(&other->rb_node, &tree->state);
287 free_extent_state(state); 287 free_extent_state(other);
288 state = NULL;
289 } 288 }
290 } 289 }
291 290
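
merge_state() now absorbs the following node into the current one (state->end = other->end, free other) rather than folding state into its neighbour, so an extent_state that a caller has cached stays alive and simply grows. A toy sketch of merging adjacent ranges so the surviving record is always the one the caller still holds:

#include <stdio.h>
#include <stdlib.h>

struct range {
	unsigned long start, end;   /* inclusive, like extent_state */
	struct range *next;
};

/*
 * Merge r with its successor when they touch.  The record we keep is r
 * itself, so any pointer the caller cached to r remains valid.
 */
static void merge_forward(struct range *r)
{
	struct range *other = r->next;

	if (other && other->start == r->end + 1) {
		r->end = other->end;       /* grow the cached record */
		r->next = other->next;
		free(other);               /* the neighbour goes away, not r */
	}
}

int main(void)
{
	struct range *b = malloc(sizeof(*b));
	struct range *a = malloc(sizeof(*a));

	*b = (struct range){ .start = 4096, .end = 8191, .next = NULL };
	*a = (struct range){ .start = 0, .end = 4095, .next = b };

	struct range *cached = a;      /* caller-side cached_state */
	merge_forward(a);
	printf("cached range now [%lu, %lu]\n", cached->start, cached->end);

	free(a);
	return 0;
}
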
@@ -351,7 +350,6 @@ static int insert_state(struct extent_io_tree *tree,
351 "%llu %llu\n", (unsigned long long)found->start, 350 "%llu %llu\n", (unsigned long long)found->start,
352 (unsigned long long)found->end, 351 (unsigned long long)found->end,
353 (unsigned long long)start, (unsigned long long)end); 352 (unsigned long long)start, (unsigned long long)end);
354 free_extent_state(state);
355 return -EEXIST; 353 return -EEXIST;
356 } 354 }
357 state->tree = tree; 355 state->tree = tree;
@@ -500,7 +498,8 @@ again:
500 cached_state = NULL; 498 cached_state = NULL;
501 } 499 }
502 500
503 if (cached && cached->tree && cached->start == start) { 501 if (cached && cached->tree && cached->start <= start &&
502 cached->end > start) {
504 if (clear) 503 if (clear)
505 atomic_dec(&cached->refs); 504 atomic_dec(&cached->refs);
506 state = cached; 505 state = cached;
@@ -742,7 +741,8 @@ again:
742 spin_lock(&tree->lock); 741 spin_lock(&tree->lock);
743 if (cached_state && *cached_state) { 742 if (cached_state && *cached_state) {
744 state = *cached_state; 743 state = *cached_state;
745 if (state->start == start && state->tree) { 744 if (state->start <= start && state->end > start &&
745 state->tree) {
746 node = &state->rb_node; 746 node = &state->rb_node;
747 goto hit_next; 747 goto hit_next;
748 } 748 }
@@ -783,13 +783,13 @@ hit_next:
783 if (err) 783 if (err)
784 goto out; 784 goto out;
785 785
786 next_node = rb_next(node);
787 cache_state(state, cached_state); 786 cache_state(state, cached_state);
788 merge_state(tree, state); 787 merge_state(tree, state);
789 if (last_end == (u64)-1) 788 if (last_end == (u64)-1)
790 goto out; 789 goto out;
791 790
792 start = last_end + 1; 791 start = last_end + 1;
792 next_node = rb_next(&state->rb_node);
793 if (next_node && start < end && prealloc && !need_resched()) { 793 if (next_node && start < end && prealloc && !need_resched()) {
794 state = rb_entry(next_node, struct extent_state, 794 state = rb_entry(next_node, struct extent_state,
795 rb_node); 795 rb_node);
@@ -862,7 +862,6 @@ hit_next:
862 * Avoid to free 'prealloc' if it can be merged with 862 * Avoid to free 'prealloc' if it can be merged with
863 * the later extent. 863 * the later extent.
864 */ 864 */
865 atomic_inc(&prealloc->refs);
866 err = insert_state(tree, prealloc, start, this_end, 865 err = insert_state(tree, prealloc, start, this_end,
867 &bits); 866 &bits);
868 BUG_ON(err == -EEXIST); 867 BUG_ON(err == -EEXIST);
@@ -872,7 +871,6 @@ hit_next:
872 goto out; 871 goto out;
873 } 872 }
874 cache_state(prealloc, cached_state); 873 cache_state(prealloc, cached_state);
875 free_extent_state(prealloc);
876 prealloc = NULL; 874 prealloc = NULL;
877 start = this_end + 1; 875 start = this_end + 1;
878 goto search_again; 876 goto search_again;
@@ -1564,7 +1562,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1564 int bitset = 0; 1562 int bitset = 0;
1565 1563
1566 spin_lock(&tree->lock); 1564 spin_lock(&tree->lock);
1567 if (cached && cached->tree && cached->start == start) 1565 if (cached && cached->tree && cached->start <= start &&
1566 cached->end > start)
1568 node = &cached->rb_node; 1567 node = &cached->rb_node;
1569 else 1568 else
1570 node = tree_search(tree, start); 1569 node = tree_search(tree, start);
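
This and the two hunks above relax the cached-state test from an exact cached->start == start match to a containment check, so a cached extent_state that has grown through a merge can still be reused. The predicate on its own, with invented names:

#include <stdbool.h>
#include <stdio.h>

struct cached_range {
	unsigned long start, end;   /* end inclusive, as in extent_state */
};

/* old check: only an exact hit on the range start could be reused */
static bool usable_old(const struct cached_range *c, unsigned long start)
{
	return c->start == start;
}

/* new check: any cached range that covers 'start' can be reused */
static bool usable_new(const struct cached_range *c, unsigned long start)
{
	return c->start <= start && c->end > start;
}

int main(void)
{
	struct cached_range c = { .start = 0, .end = 8191 };  /* grew via a merge */

	printf("old: %d  new: %d\n", usable_old(&c, 4096), usable_new(&c, 4096));
	return 0;
}
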
@@ -2432,6 +2431,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2432 pgoff_t index; 2431 pgoff_t index;
2433 pgoff_t end; /* Inclusive */ 2432 pgoff_t end; /* Inclusive */
2434 int scanned = 0; 2433 int scanned = 0;
2434 int tag;
2435 2435
2436 pagevec_init(&pvec, 0); 2436 pagevec_init(&pvec, 0);
2437 if (wbc->range_cyclic) { 2437 if (wbc->range_cyclic) {
@@ -2442,11 +2442,16 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2442 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2442 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2443 scanned = 1; 2443 scanned = 1;
2444 } 2444 }
2445 if (wbc->sync_mode == WB_SYNC_ALL)
2446 tag = PAGECACHE_TAG_TOWRITE;
2447 else
2448 tag = PAGECACHE_TAG_DIRTY;
2445retry: 2449retry:
2450 if (wbc->sync_mode == WB_SYNC_ALL)
2451 tag_pages_for_writeback(mapping, index, end);
2446 while (!done && !nr_to_write_done && (index <= end) && 2452 while (!done && !nr_to_write_done && (index <= end) &&
2447 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2453 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2448 PAGECACHE_TAG_DIRTY, min(end - index, 2454 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2449 (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2450 unsigned i; 2455 unsigned i;
2451 2456
2452 scanned = 1; 2457 scanned = 1;
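
For WB_SYNC_ALL writeback the page walk now calls tag_pages_for_writeback() up front and scans PAGECACHE_TAG_TOWRITE instead of PAGECACHE_TAG_DIRTY, so pages dirtied while the sync is in flight do not keep extending the walk. A userspace toy of the same snapshot-then-write idea, with a plain bitmap in place of the radix-tree tags:

#include <stdbool.h>
#include <stdio.h>

#define NPAGES 8

static bool dirty[NPAGES];
static bool towrite[NPAGES];

/* stand-in for tag_pages_for_writeback(): snapshot today's dirty pages */
static void tag_for_writeback(void)
{
	for (int i = 0; i < NPAGES; i++)
		towrite[i] = dirty[i];
}

static void writeback_sync(void)
{
	tag_for_writeback();
	for (int i = 0; i < NPAGES; i++) {
		if (!towrite[i])
			continue;
		towrite[i] = false;
		dirty[i] = false;
		printf("wrote page %d\n", i);

		/* a writer dirties a new page mid-sync; it is NOT picked up,
		 * so the walk is guaranteed to terminate */
		if (i == 1)
			dirty[5] = true;
	}
}

int main(void)
{
	dirty[0] = dirty[1] = dirty[2] = true;
	writeback_sync();
	printf("page 5 still dirty: %d\n", dirty[5]);   /* 1 */
	return 0;
}
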
@@ -3020,8 +3025,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3020 return NULL; 3025 return NULL;
3021 eb->start = start; 3026 eb->start = start;
3022 eb->len = len; 3027 eb->len = len;
3023 spin_lock_init(&eb->lock); 3028 rwlock_init(&eb->lock);
3024 init_waitqueue_head(&eb->lock_wq); 3029 atomic_set(&eb->write_locks, 0);
3030 atomic_set(&eb->read_locks, 0);
3031 atomic_set(&eb->blocking_readers, 0);
3032 atomic_set(&eb->blocking_writers, 0);
3033 atomic_set(&eb->spinning_readers, 0);
3034 atomic_set(&eb->spinning_writers, 0);
3035 init_waitqueue_head(&eb->write_lock_wq);
3036 init_waitqueue_head(&eb->read_lock_wq);
3025 3037
3026#if LEAK_DEBUG 3038#if LEAK_DEBUG
3027 spin_lock_irqsave(&leak_lock, flags); 3039 spin_lock_irqsave(&leak_lock, flags);
@@ -3117,7 +3129,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3117 i = 0; 3129 i = 0;
3118 } 3130 }
3119 for (; i < num_pages; i++, index++) { 3131 for (; i < num_pages; i++, index++) {
3120 p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM); 3132 p = find_or_create_page(mapping, index, GFP_NOFS);
3121 if (!p) { 3133 if (!p) {
3122 WARN_ON(1); 3134 WARN_ON(1);
3123 goto free_eb; 3135 goto free_eb;
@@ -3264,6 +3276,22 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
3264 return was_dirty; 3276 return was_dirty;
3265} 3277}
3266 3278
3279static int __eb_straddles_pages(u64 start, u64 len)
3280{
3281 if (len < PAGE_CACHE_SIZE)
3282 return 1;
3283 if (start & (PAGE_CACHE_SIZE - 1))
3284 return 1;
3285 if ((start + len) & (PAGE_CACHE_SIZE - 1))
3286 return 1;
3287 return 0;
3288}
3289
3290static int eb_straddles_pages(struct extent_buffer *eb)
3291{
3292 return __eb_straddles_pages(eb->start, eb->len);
3293}
3294
3267int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3295int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3268 struct extent_buffer *eb, 3296 struct extent_buffer *eb,
3269 struct extent_state **cached_state) 3297 struct extent_state **cached_state)
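
The new eb_straddles_pages() helpers flag an extent buffer for extent-state tracking only when it is smaller than a page or not page aligned at either end; fully aligned buffers can rely on the per-page uptodate bits, which is what the callers below switch to. The predicate is easy to check with numbers; a standalone sketch assuming 4K pages:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL

/* mirrors __eb_straddles_pages(): 1 if the range is not whole, aligned pages */
static int straddles_pages(uint64_t start, uint64_t len)
{
	if (len < PAGE_SIZE)
		return 1;
	if (start & (PAGE_SIZE - 1))
		return 1;
	if ((start + len) & (PAGE_SIZE - 1))
		return 1;
	return 0;
}

int main(void)
{
	/* aligned 16K node: tracked purely via page flags */
	printf("%d\n", straddles_pages(4 * PAGE_SIZE, 4 * PAGE_SIZE));     /* 0 */
	/* 4K node at an odd offset: still needs the extent state bits */
	printf("%d\n", straddles_pages(6 * PAGE_SIZE + 2048, PAGE_SIZE));  /* 1 */
	/* sub-page leaf (e.g. 2K): also straddles */
	printf("%d\n", straddles_pages(8 * PAGE_SIZE, 2048));              /* 1 */
	return 0;
}
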
@@ -3275,8 +3303,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3275 num_pages = num_extent_pages(eb->start, eb->len); 3303 num_pages = num_extent_pages(eb->start, eb->len);
3276 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3304 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3277 3305
3278 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3306 if (eb_straddles_pages(eb)) {
3279 cached_state, GFP_NOFS); 3307 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3308 cached_state, GFP_NOFS);
3309 }
3280 for (i = 0; i < num_pages; i++) { 3310 for (i = 0; i < num_pages; i++) {
3281 page = extent_buffer_page(eb, i); 3311 page = extent_buffer_page(eb, i);
3282 if (page) 3312 if (page)
@@ -3294,8 +3324,10 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3294 3324
3295 num_pages = num_extent_pages(eb->start, eb->len); 3325 num_pages = num_extent_pages(eb->start, eb->len);
3296 3326
3297 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3327 if (eb_straddles_pages(eb)) {
3298 NULL, GFP_NOFS); 3328 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3329 NULL, GFP_NOFS);
3330 }
3299 for (i = 0; i < num_pages; i++) { 3331 for (i = 0; i < num_pages; i++) {
3300 page = extent_buffer_page(eb, i); 3332 page = extent_buffer_page(eb, i);
3301 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3333 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3318,9 +3350,12 @@ int extent_range_uptodate(struct extent_io_tree *tree,
3318 int uptodate; 3350 int uptodate;
3319 unsigned long index; 3351 unsigned long index;
3320 3352
3321 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); 3353 if (__eb_straddles_pages(start, end - start + 1)) {
3322 if (ret) 3354 ret = test_range_bit(tree, start, end,
3323 return 1; 3355 EXTENT_UPTODATE, 1, NULL);
3356 if (ret)
3357 return 1;
3358 }
3324 while (start <= end) { 3359 while (start <= end) {
3325 index = start >> PAGE_CACHE_SHIFT; 3360 index = start >> PAGE_CACHE_SHIFT;
3326 page = find_get_page(tree->mapping, index); 3361 page = find_get_page(tree->mapping, index);
@@ -3348,10 +3383,12 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3348 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3383 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3349 return 1; 3384 return 1;
3350 3385
3351 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3386 if (eb_straddles_pages(eb)) {
3352 EXTENT_UPTODATE, 1, cached_state); 3387 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3353 if (ret) 3388 EXTENT_UPTODATE, 1, cached_state);
3354 return ret; 3389 if (ret)
3390 return ret;
3391 }
3355 3392
3356 num_pages = num_extent_pages(eb->start, eb->len); 3393 num_pages = num_extent_pages(eb->start, eb->len);
3357 for (i = 0; i < num_pages; i++) { 3394 for (i = 0; i < num_pages; i++) {
@@ -3384,9 +3421,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3384 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3421 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3385 return 0; 3422 return 0;
3386 3423
3387 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3424 if (eb_straddles_pages(eb)) {
3388 EXTENT_UPTODATE, 1, NULL)) { 3425 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3389 return 0; 3426 EXTENT_UPTODATE, 1, NULL)) {
3427 return 0;
3428 }
3390 } 3429 }
3391 3430
3392 if (start) { 3431 if (start) {
@@ -3490,9 +3529,8 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3490 page = extent_buffer_page(eb, i); 3529 page = extent_buffer_page(eb, i);
3491 3530
3492 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3531 cur = min(len, (PAGE_CACHE_SIZE - offset));
3493 kaddr = kmap_atomic(page, KM_USER1); 3532 kaddr = page_address(page);
3494 memcpy(dst, kaddr + offset, cur); 3533 memcpy(dst, kaddr + offset, cur);
3495 kunmap_atomic(kaddr, KM_USER1);
3496 3534
3497 dst += cur; 3535 dst += cur;
3498 len -= cur; 3536 len -= cur;
@@ -3502,9 +3540,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3502} 3540}
3503 3541
3504int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 3542int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3505 unsigned long min_len, char **token, char **map, 3543 unsigned long min_len, char **map,
3506 unsigned long *map_start, 3544 unsigned long *map_start,
3507 unsigned long *map_len, int km) 3545 unsigned long *map_len)
3508{ 3546{
3509 size_t offset = start & (PAGE_CACHE_SIZE - 1); 3547 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3510 char *kaddr; 3548 char *kaddr;
@@ -3534,42 +3572,12 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3534 } 3572 }
3535 3573
3536 p = extent_buffer_page(eb, i); 3574 p = extent_buffer_page(eb, i);
3537 kaddr = kmap_atomic(p, km); 3575 kaddr = page_address(p);
3538 *token = kaddr;
3539 *map = kaddr + offset; 3576 *map = kaddr + offset;
3540 *map_len = PAGE_CACHE_SIZE - offset; 3577 *map_len = PAGE_CACHE_SIZE - offset;
3541 return 0; 3578 return 0;
3542} 3579}
3543 3580
3544int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3545 unsigned long min_len,
3546 char **token, char **map,
3547 unsigned long *map_start,
3548 unsigned long *map_len, int km)
3549{
3550 int err;
3551 int save = 0;
3552 if (eb->map_token) {
3553 unmap_extent_buffer(eb, eb->map_token, km);
3554 eb->map_token = NULL;
3555 save = 1;
3556 }
3557 err = map_private_extent_buffer(eb, start, min_len, token, map,
3558 map_start, map_len, km);
3559 if (!err && save) {
3560 eb->map_token = *token;
3561 eb->kaddr = *map;
3562 eb->map_start = *map_start;
3563 eb->map_len = *map_len;
3564 }
3565 return err;
3566}
3567
3568void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3569{
3570 kunmap_atomic(token, km);
3571}
3572
3573int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 3581int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3574 unsigned long start, 3582 unsigned long start,
3575 unsigned long len) 3583 unsigned long len)
@@ -3593,9 +3601,8 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3593 3601
3594 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3602 cur = min(len, (PAGE_CACHE_SIZE - offset));
3595 3603
3596 kaddr = kmap_atomic(page, KM_USER0); 3604 kaddr = page_address(page);
3597 ret = memcmp(ptr, kaddr + offset, cur); 3605 ret = memcmp(ptr, kaddr + offset, cur);
3598 kunmap_atomic(kaddr, KM_USER0);
3599 if (ret) 3606 if (ret)
3600 break; 3607 break;
3601 3608
@@ -3628,9 +3635,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3628 WARN_ON(!PageUptodate(page)); 3635 WARN_ON(!PageUptodate(page));
3629 3636
3630 cur = min(len, PAGE_CACHE_SIZE - offset); 3637 cur = min(len, PAGE_CACHE_SIZE - offset);
3631 kaddr = kmap_atomic(page, KM_USER1); 3638 kaddr = page_address(page);
3632 memcpy(kaddr + offset, src, cur); 3639 memcpy(kaddr + offset, src, cur);
3633 kunmap_atomic(kaddr, KM_USER1);
3634 3640
3635 src += cur; 3641 src += cur;
3636 len -= cur; 3642 len -= cur;
@@ -3659,9 +3665,8 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
3659 WARN_ON(!PageUptodate(page)); 3665 WARN_ON(!PageUptodate(page));
3660 3666
3661 cur = min(len, PAGE_CACHE_SIZE - offset); 3667 cur = min(len, PAGE_CACHE_SIZE - offset);
3662 kaddr = kmap_atomic(page, KM_USER0); 3668 kaddr = page_address(page);
3663 memset(kaddr + offset, c, cur); 3669 memset(kaddr + offset, c, cur);
3664 kunmap_atomic(kaddr, KM_USER0);
3665 3670
3666 len -= cur; 3671 len -= cur;
3667 offset = 0; 3672 offset = 0;
@@ -3692,9 +3697,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3692 3697
3693 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 3698 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3694 3699
3695 kaddr = kmap_atomic(page, KM_USER0); 3700 kaddr = page_address(page);
3696 read_extent_buffer(src, kaddr + offset, src_offset, cur); 3701 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3697 kunmap_atomic(kaddr, KM_USER0);
3698 3702
3699 src_offset += cur; 3703 src_offset += cur;
3700 len -= cur; 3704 len -= cur;
@@ -3707,20 +3711,17 @@ static void move_pages(struct page *dst_page, struct page *src_page,
3707 unsigned long dst_off, unsigned long src_off, 3711 unsigned long dst_off, unsigned long src_off,
3708 unsigned long len) 3712 unsigned long len)
3709{ 3713{
3710 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3714 char *dst_kaddr = page_address(dst_page);
3711 if (dst_page == src_page) { 3715 if (dst_page == src_page) {
3712 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); 3716 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3713 } else { 3717 } else {
3714 char *src_kaddr = kmap_atomic(src_page, KM_USER1); 3718 char *src_kaddr = page_address(src_page);
3715 char *p = dst_kaddr + dst_off + len; 3719 char *p = dst_kaddr + dst_off + len;
3716 char *s = src_kaddr + src_off + len; 3720 char *s = src_kaddr + src_off + len;
3717 3721
3718 while (len--) 3722 while (len--)
3719 *--p = *--s; 3723 *--p = *--s;
3720
3721 kunmap_atomic(src_kaddr, KM_USER1);
3722 } 3724 }
3723 kunmap_atomic(dst_kaddr, KM_USER0);
3724} 3725}
3725 3726
3726static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 3727static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
@@ -3733,20 +3734,17 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
3733 unsigned long dst_off, unsigned long src_off, 3734 unsigned long dst_off, unsigned long src_off,
3734 unsigned long len) 3735 unsigned long len)
3735{ 3736{
3736 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3737 char *dst_kaddr = page_address(dst_page);
3737 char *src_kaddr; 3738 char *src_kaddr;
3738 3739
3739 if (dst_page != src_page) { 3740 if (dst_page != src_page) {
3740 src_kaddr = kmap_atomic(src_page, KM_USER1); 3741 src_kaddr = page_address(src_page);
3741 } else { 3742 } else {
3742 src_kaddr = dst_kaddr; 3743 src_kaddr = dst_kaddr;
3743 BUG_ON(areas_overlap(src_off, dst_off, len)); 3744 BUG_ON(areas_overlap(src_off, dst_off, len));
3744 } 3745 }
3745 3746
3746 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3747 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3747 kunmap_atomic(dst_kaddr, KM_USER0);
3748 if (dst_page != src_page)
3749 kunmap_atomic(src_kaddr, KM_USER1);
3750} 3748}
3751 3749
3752void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3750void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a11a92ee2d3..21a7ca9e728 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -120,8 +120,6 @@ struct extent_state {
120struct extent_buffer { 120struct extent_buffer {
121 u64 start; 121 u64 start;
122 unsigned long len; 122 unsigned long len;
123 char *map_token;
124 char *kaddr;
125 unsigned long map_start; 123 unsigned long map_start;
126 unsigned long map_len; 124 unsigned long map_len;
127 struct page *first_page; 125 struct page *first_page;
@@ -130,14 +128,26 @@ struct extent_buffer {
130 struct rcu_head rcu_head; 128 struct rcu_head rcu_head;
131 atomic_t refs; 129 atomic_t refs;
132 130
133 /* the spinlock is used to protect most operations */ 131 /* count of read lock holders on the extent buffer */
134 spinlock_t lock; 132 atomic_t write_locks;
133 atomic_t read_locks;
134 atomic_t blocking_writers;
135 atomic_t blocking_readers;
136 atomic_t spinning_readers;
137 atomic_t spinning_writers;
138
139 /* protects write locks */
140 rwlock_t lock;
135 141
136 /* 142 /* readers use lock_wq while they wait for the write
137 * when we keep the lock held while blocking, waiters go onto 143 * lock holders to unlock
138 * the wq
139 */ 144 */
140 wait_queue_head_t lock_wq; 145 wait_queue_head_t write_lock_wq;
146
147 /* writers use read_lock_wq while they wait for readers
148 * to unlock
149 */
150 wait_queue_head_t read_lock_wq;
141}; 151};
142 152
143static inline void extent_set_compress_type(unsigned long *bio_flags, 153static inline void extent_set_compress_type(unsigned long *bio_flags,
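
The extent_buffer lock goes from a spinlock plus a single wait queue to an rwlock with a family of counters (read/write holders, blocking vs spinning) that locking.c consults when deciding whether a lock can stay spinning or must go blocking. As a much-reduced userspace analogue, here is just the counter bookkeeping paired with a pthread rwlock, not the real spinning/blocking state machine:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct eb_lock {
	pthread_rwlock_t lock;       /* the rwlock_t in extent_buffer */
	atomic_int read_locks;       /* current read holders */
	atomic_int write_locks;      /* current write holder (0 or 1) */
};

static void tree_read_lock(struct eb_lock *eb)
{
	pthread_rwlock_rdlock(&eb->lock);
	atomic_fetch_add(&eb->read_locks, 1);
}

static void tree_read_unlock(struct eb_lock *eb)
{
	atomic_fetch_sub(&eb->read_locks, 1);
	pthread_rwlock_unlock(&eb->lock);
}

static void tree_write_lock(struct eb_lock *eb)
{
	pthread_rwlock_wrlock(&eb->lock);
	atomic_fetch_add(&eb->write_locks, 1);
}

static void tree_write_unlock(struct eb_lock *eb)
{
	atomic_fetch_sub(&eb->write_locks, 1);
	pthread_rwlock_unlock(&eb->lock);
}

static void *reader(void *arg)
{
	struct eb_lock *eb = arg;
	tree_read_lock(eb);
	printf("readers now: %d\n", atomic_load(&eb->read_locks));
	tree_read_unlock(eb);
	return NULL;
}

int main(void)
{
	struct eb_lock eb = { .lock = PTHREAD_RWLOCK_INITIALIZER };
	pthread_t t[3];

	for (int i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, reader, &eb);
	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);

	tree_write_lock(&eb);
	printf("writer holds the lock, write_locks=%d\n",
	       atomic_load(&eb.write_locks));
	tree_write_unlock(&eb);
	return 0;
}
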
@@ -279,15 +289,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
279int extent_buffer_uptodate(struct extent_io_tree *tree, 289int extent_buffer_uptodate(struct extent_io_tree *tree,
280 struct extent_buffer *eb, 290 struct extent_buffer *eb,
281 struct extent_state *cached_state); 291 struct extent_state *cached_state);
282int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
283 unsigned long min_len, char **token, char **map,
284 unsigned long *map_start,
285 unsigned long *map_len, int km);
286int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, 292int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
287 unsigned long min_len, char **token, char **map, 293 unsigned long min_len, char **map,
288 unsigned long *map_start, 294 unsigned long *map_start,
289 unsigned long *map_len, int km); 295 unsigned long *map_len);
290void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
291int extent_range_uptodate(struct extent_io_tree *tree, 296int extent_range_uptodate(struct extent_io_tree *tree,
292 u64 start, u64 end); 297 u64 start, u64 end);
293int extent_clear_unlock_delalloc(struct inode *inode, 298int extent_clear_unlock_delalloc(struct inode *inode,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 90d4ee52cd4..08bcfa92a22 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -177,6 +177,15 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
177 177
178 WARN_ON(bio->bi_vcnt <= 0); 178 WARN_ON(bio->bi_vcnt <= 0);
179 179
180 /*
181 * the free space stuff is only read when it hasn't been
182 * updated in the current transaction. So, we can safely
183 * read from the commit root and sidestep a nasty deadlock
184 * between reading the free space cache and updating the csum tree.
185 */
186 if (btrfs_is_free_space_inode(root, inode))
187 path->search_commit_root = 1;
188
180 disk_bytenr = (u64)bio->bi_sector << 9; 189 disk_bytenr = (u64)bio->bi_sector << 9;
181 if (dio) 190 if (dio)
182 offset = logical_offset; 191 offset = logical_offset;
@@ -664,10 +673,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
664 struct btrfs_sector_sum *sector_sum; 673 struct btrfs_sector_sum *sector_sum;
665 u32 nritems; 674 u32 nritems;
666 u32 ins_size; 675 u32 ins_size;
667 char *eb_map;
668 char *eb_token;
669 unsigned long map_len;
670 unsigned long map_start;
671 u16 csum_size = 676 u16 csum_size =
672 btrfs_super_csum_size(&root->fs_info->super_copy); 677 btrfs_super_csum_size(&root->fs_info->super_copy);
673 678
@@ -814,30 +819,9 @@ found:
814 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); 819 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
815 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + 820 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
816 btrfs_item_size_nr(leaf, path->slots[0])); 821 btrfs_item_size_nr(leaf, path->slots[0]));
817 eb_token = NULL;
818next_sector: 822next_sector:
819 823
820 if (!eb_token || 824 write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size);
821 (unsigned long)item + csum_size >= map_start + map_len) {
822 int err;
823
824 if (eb_token)
825 unmap_extent_buffer(leaf, eb_token, KM_USER1);
826 eb_token = NULL;
827 err = map_private_extent_buffer(leaf, (unsigned long)item,
828 csum_size,
829 &eb_token, &eb_map,
830 &map_start, &map_len, KM_USER1);
831 if (err)
832 eb_token = NULL;
833 }
834 if (eb_token) {
835 memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
836 &sector_sum->sum, csum_size);
837 } else {
838 write_extent_buffer(leaf, &sector_sum->sum,
839 (unsigned long)item, csum_size);
840 }
841 825
842 total_bytes += root->sectorsize; 826 total_bytes += root->sectorsize;
843 sector_sum++; 827 sector_sum++;
@@ -850,10 +834,7 @@ next_sector:
850 goto next_sector; 834 goto next_sector;
851 } 835 }
852 } 836 }
853 if (eb_token) { 837
854 unmap_extent_buffer(leaf, eb_token, KM_USER1);
855 eb_token = NULL;
856 }
857 btrfs_mark_buffer_dirty(path->nodes[0]); 838 btrfs_mark_buffer_dirty(path->nodes[0]);
858 if (total_bytes < sums->len) { 839 if (total_bytes < sums->len) {
859 btrfs_release_path(path); 840 btrfs_release_path(path);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 59cbdb120ad..a35e51c9f23 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1081,7 +1081,8 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1081 1081
1082again: 1082again:
1083 for (i = 0; i < num_pages; i++) { 1083 for (i = 0; i < num_pages; i++) {
1084 pages[i] = grab_cache_page(inode->i_mapping, index + i); 1084 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1085 GFP_NOFS);
1085 if (!pages[i]) { 1086 if (!pages[i]) {
1086 faili = i - 1; 1087 faili = i - 1;
1087 err = -ENOMEM; 1088 err = -ENOMEM;
@@ -1238,9 +1239,11 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1238 * managed to copy. 1239 * managed to copy.
1239 */ 1240 */
1240 if (num_pages > dirty_pages) { 1241 if (num_pages > dirty_pages) {
1241 if (copied > 0) 1242 if (copied > 0) {
1242 atomic_inc( 1243 spin_lock(&BTRFS_I(inode)->lock);
1243 &BTRFS_I(inode)->outstanding_extents); 1244 BTRFS_I(inode)->outstanding_extents++;
1245 spin_unlock(&BTRFS_I(inode)->lock);
1246 }
1244 btrfs_delalloc_release_space(inode, 1247 btrfs_delalloc_release_space(inode,
1245 (num_pages - dirty_pages) << 1248 (num_pages - dirty_pages) <<
1246 PAGE_CACHE_SHIFT); 1249 PAGE_CACHE_SHIFT);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index bf0d61567f3..6377713f639 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -98,6 +98,12 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
98 return inode; 98 return inode;
99 99
100 spin_lock(&block_group->lock); 100 spin_lock(&block_group->lock);
101 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) {
102 printk(KERN_INFO "Old style space inode found, converting.\n");
103 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM;
104 block_group->disk_cache_state = BTRFS_DC_CLEAR;
105 }
106
101 if (!btrfs_fs_closing(root->fs_info)) { 107 if (!btrfs_fs_closing(root->fs_info)) {
102 block_group->inode = igrab(inode); 108 block_group->inode = igrab(inode);
103 block_group->iref = 1; 109 block_group->iref = 1;
@@ -135,7 +141,7 @@ int __create_free_space_inode(struct btrfs_root *root,
135 btrfs_set_inode_gid(leaf, inode_item, 0); 141 btrfs_set_inode_gid(leaf, inode_item, 0);
136 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 142 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
137 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | 143 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
138 BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM); 144 BTRFS_INODE_PREALLOC);
139 btrfs_set_inode_nlink(leaf, inode_item, 1); 145 btrfs_set_inode_nlink(leaf, inode_item, 1);
140 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 146 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
141 btrfs_set_inode_block_group(leaf, inode_item, offset); 147 btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -239,17 +245,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
239 struct btrfs_free_space_header *header; 245 struct btrfs_free_space_header *header;
240 struct extent_buffer *leaf; 246 struct extent_buffer *leaf;
241 struct page *page; 247 struct page *page;
242 u32 *checksums = NULL, *crc;
243 char *disk_crcs = NULL;
244 struct btrfs_key key; 248 struct btrfs_key key;
245 struct list_head bitmaps; 249 struct list_head bitmaps;
246 u64 num_entries; 250 u64 num_entries;
247 u64 num_bitmaps; 251 u64 num_bitmaps;
248 u64 generation; 252 u64 generation;
249 u32 cur_crc = ~(u32)0;
250 pgoff_t index = 0; 253 pgoff_t index = 0;
251 unsigned long first_page_offset;
252 int num_checksums;
253 int ret = 0; 254 int ret = 0;
254 255
255 INIT_LIST_HEAD(&bitmaps); 256 INIT_LIST_HEAD(&bitmaps);
@@ -292,16 +293,6 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
292 if (!num_entries) 293 if (!num_entries)
293 goto out; 294 goto out;
294 295
295 /* Setup everything for doing checksumming */
296 num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
297 checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
298 if (!checksums)
299 goto out;
300 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
301 disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
302 if (!disk_crcs)
303 goto out;
304
305 ret = readahead_cache(inode); 296 ret = readahead_cache(inode);
306 if (ret) 297 if (ret)
307 goto out; 298 goto out;
@@ -311,18 +302,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
311 struct btrfs_free_space *e; 302 struct btrfs_free_space *e;
312 void *addr; 303 void *addr;
313 unsigned long offset = 0; 304 unsigned long offset = 0;
314 unsigned long start_offset = 0;
315 int need_loop = 0; 305 int need_loop = 0;
316 306
317 if (!num_entries && !num_bitmaps) 307 if (!num_entries && !num_bitmaps)
318 break; 308 break;
319 309
320 if (index == 0) { 310 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
321 start_offset = first_page_offset;
322 offset = start_offset;
323 }
324
325 page = grab_cache_page(inode->i_mapping, index);
326 if (!page) 311 if (!page)
327 goto free_cache; 312 goto free_cache;
328 313
@@ -342,8 +327,15 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
342 if (index == 0) { 327 if (index == 0) {
343 u64 *gen; 328 u64 *gen;
344 329
345 memcpy(disk_crcs, addr, first_page_offset); 330 /*
346 gen = addr + (sizeof(u32) * num_checksums); 331 * We put a bogus crc in the front of the first page in
332 * case old kernels try to mount a fs with the new
333 * format to make sure they discard the cache.
334 */
335 addr += sizeof(u64);
336 offset += sizeof(u64);
337
338 gen = addr;
347 if (*gen != BTRFS_I(inode)->generation) { 339 if (*gen != BTRFS_I(inode)->generation) {
348 printk(KERN_ERR "btrfs: space cache generation" 340 printk(KERN_ERR "btrfs: space cache generation"
349 " (%llu) does not match inode (%llu)\n", 341 " (%llu) does not match inode (%llu)\n",
@@ -355,24 +347,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
355 page_cache_release(page); 347 page_cache_release(page);
356 goto free_cache; 348 goto free_cache;
357 } 349 }
358 crc = (u32 *)disk_crcs; 350 addr += sizeof(u64);
359 } 351 offset += sizeof(u64);
360 entry = addr + start_offset;
361
362 /* First lets check our crc before we do anything fun */
363 cur_crc = ~(u32)0;
364 cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
365 PAGE_CACHE_SIZE - start_offset);
366 btrfs_csum_final(cur_crc, (char *)&cur_crc);
367 if (cur_crc != *crc) {
368 printk(KERN_ERR "btrfs: crc mismatch for page %lu\n",
369 index);
370 kunmap(page);
371 unlock_page(page);
372 page_cache_release(page);
373 goto free_cache;
374 } 352 }
375 crc++; 353 entry = addr;
376 354
377 while (1) { 355 while (1) {
378 if (!num_entries) 356 if (!num_entries)
@@ -470,8 +448,6 @@ next:
470 448
471 ret = 1; 449 ret = 1;
472out: 450out:
473 kfree(checksums);
474 kfree(disk_crcs);
475 return ret; 451 return ret;
476free_cache: 452free_cache:
477 __btrfs_remove_free_space_cache(ctl); 453 __btrfs_remove_free_space_cache(ctl);
@@ -569,8 +545,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
569 struct btrfs_key key; 545 struct btrfs_key key;
570 u64 start, end, len; 546 u64 start, end, len;
571 u64 bytes = 0; 547 u64 bytes = 0;
572 u32 *crc, *checksums; 548 u32 crc = ~(u32)0;
573 unsigned long first_page_offset;
574 int index = 0, num_pages = 0; 549 int index = 0, num_pages = 0;
575 int entries = 0; 550 int entries = 0;
576 int bitmaps = 0; 551 int bitmaps = 0;
@@ -590,34 +565,13 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
590 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 565 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
591 PAGE_CACHE_SHIFT; 566 PAGE_CACHE_SHIFT;
592 567
593 /* Since the first page has all of our checksums and our generation we
594 * need to calculate the offset into the page that we can start writing
595 * our entries.
596 */
597 first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
598
599 filemap_write_and_wait(inode->i_mapping); 568 filemap_write_and_wait(inode->i_mapping);
600 btrfs_wait_ordered_range(inode, inode->i_size & 569 btrfs_wait_ordered_range(inode, inode->i_size &
601 ~(root->sectorsize - 1), (u64)-1); 570 ~(root->sectorsize - 1), (u64)-1);
602 571
603 /* make sure we don't overflow that first page */
604 if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) {
605 /* this is really the same as running out of space, where we also return 0 */
606 printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n");
607 ret = 0;
608 goto out_update;
609 }
610
611 /* We need a checksum per page. */
612 crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
613 if (!crc)
614 return -1;
615
616 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); 572 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
617 if (!pages) { 573 if (!pages)
618 kfree(crc);
619 return -1; 574 return -1;
620 }
621 575
622 /* Get the cluster for this block_group if it exists */ 576 /* Get the cluster for this block_group if it exists */
623 if (block_group && !list_empty(&block_group->cluster_list)) 577 if (block_group && !list_empty(&block_group->cluster_list))
@@ -640,7 +594,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
640 * know and don't freak out. 594 * know and don't freak out.
641 */ 595 */
642 while (index < num_pages) { 596 while (index < num_pages) {
643 page = grab_cache_page(inode->i_mapping, index); 597 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
644 if (!page) { 598 if (!page) {
645 int i; 599 int i;
646 600
@@ -648,7 +602,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
648 unlock_page(pages[i]); 602 unlock_page(pages[i]);
649 page_cache_release(pages[i]); 603 page_cache_release(pages[i]);
650 } 604 }
651 goto out_free; 605 goto out;
652 } 606 }
653 pages[index] = page; 607 pages[index] = page;
654 index++; 608 index++;
@@ -668,17 +622,11 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
668 /* Write out the extent entries */ 622 /* Write out the extent entries */
669 do { 623 do {
670 struct btrfs_free_space_entry *entry; 624 struct btrfs_free_space_entry *entry;
671 void *addr; 625 void *addr, *orig;
672 unsigned long offset = 0; 626 unsigned long offset = 0;
673 unsigned long start_offset = 0;
674 627
675 next_page = false; 628 next_page = false;
676 629
677 if (index == 0) {
678 start_offset = first_page_offset;
679 offset = start_offset;
680 }
681
682 if (index >= num_pages) { 630 if (index >= num_pages) {
683 out_of_space = true; 631 out_of_space = true;
684 break; 632 break;
@@ -686,10 +634,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
686 634
687 page = pages[index]; 635 page = pages[index];
688 636
689 addr = kmap(page); 637 orig = addr = kmap(page);
690 entry = addr + start_offset; 638 if (index == 0) {
639 u64 *gen;
691 640
692 memset(addr, 0, PAGE_CACHE_SIZE); 641 /*
642 * We're going to put in a bogus crc for this page to
643 * make sure that old kernels who aren't aware of this
644 * format will be sure to discard the cache.
645 */
646 addr += sizeof(u64);
647 offset += sizeof(u64);
648
649 gen = addr;
650 *gen = trans->transid;
651 addr += sizeof(u64);
652 offset += sizeof(u64);
653 }
654 entry = addr;
655
656 memset(addr, 0, PAGE_CACHE_SIZE - offset);
693 while (node && !next_page) { 657 while (node && !next_page) {
694 struct btrfs_free_space *e; 658 struct btrfs_free_space *e;
695 659
@@ -752,13 +716,19 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
752 next_page = true; 716 next_page = true;
753 entry++; 717 entry++;
754 } 718 }
755 *crc = ~(u32)0;
756 *crc = btrfs_csum_data(root, addr + start_offset, *crc,
757 PAGE_CACHE_SIZE - start_offset);
758 kunmap(page);
759 719
760 btrfs_csum_final(*crc, (char *)crc); 720 /* Generate bogus crc value */
761 crc++; 721 if (index == 0) {
722 u32 *tmp;
723 crc = btrfs_csum_data(root, orig + sizeof(u64), crc,
724 PAGE_CACHE_SIZE - sizeof(u64));
725 btrfs_csum_final(crc, (char *)&crc);
726 crc++;
727 tmp = orig;
728 *tmp = crc;
729 }
730
731 kunmap(page);
762 732
763 bytes += PAGE_CACHE_SIZE; 733 bytes += PAGE_CACHE_SIZE;
764 734
@@ -779,11 +749,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
779 749
780 addr = kmap(page); 750 addr = kmap(page);
781 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); 751 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
782 *crc = ~(u32)0;
783 *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
784 kunmap(page); 752 kunmap(page);
785 btrfs_csum_final(*crc, (char *)crc);
786 crc++;
787 bytes += PAGE_CACHE_SIZE; 753 bytes += PAGE_CACHE_SIZE;
788 754
789 list_del_init(&entry->list); 755 list_del_init(&entry->list);
@@ -796,7 +762,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
796 i_size_read(inode) - 1, &cached_state, 762 i_size_read(inode) - 1, &cached_state,
797 GFP_NOFS); 763 GFP_NOFS);
798 ret = 0; 764 ret = 0;
799 goto out_free; 765 goto out;
800 } 766 }
801 767
802 /* Zero out the rest of the pages just to make sure */ 768 /* Zero out the rest of the pages just to make sure */
@@ -811,20 +777,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
811 index++; 777 index++;
812 } 778 }
813 779
814 /* Write the checksums and trans id to the first page */
815 {
816 void *addr;
817 u64 *gen;
818
819 page = pages[0];
820
821 addr = kmap(page);
822 memcpy(addr, checksums, sizeof(u32) * num_pages);
823 gen = addr + (sizeof(u32) * num_pages);
824 *gen = trans->transid;
825 kunmap(page);
826 }
827
828 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, 780 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
829 bytes, &cached_state); 781 bytes, &cached_state);
830 btrfs_drop_pages(pages, num_pages); 782 btrfs_drop_pages(pages, num_pages);
@@ -833,7 +785,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 785
834 if (ret) { 786 if (ret) {
835 ret = 0; 787 ret = 0;
836 goto out_free; 788 goto out;
837 } 789 }
838 790
839 BTRFS_I(inode)->generation = trans->transid; 791 BTRFS_I(inode)->generation = trans->transid;
@@ -850,7 +802,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
850 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 802 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
851 EXTENT_DIRTY | EXTENT_DELALLOC | 803 EXTENT_DIRTY | EXTENT_DELALLOC |
852 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); 804 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
853 goto out_free; 805 goto out;
854 } 806 }
855 leaf = path->nodes[0]; 807 leaf = path->nodes[0];
856 if (ret > 0) { 808 if (ret > 0) {
@@ -866,7 +818,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
866 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 818 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
867 GFP_NOFS); 819 GFP_NOFS);
868 btrfs_release_path(path); 820 btrfs_release_path(path);
869 goto out_free; 821 goto out;
870 } 822 }
871 } 823 }
872 header = btrfs_item_ptr(leaf, path->slots[0], 824 header = btrfs_item_ptr(leaf, path->slots[0],
@@ -879,11 +831,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
879 831
880 ret = 1; 832 ret = 1;
881 833
882out_free: 834out:
883 kfree(checksums);
884 kfree(pages); 835 kfree(pages);
885
886out_update:
887 if (ret != 1) { 836 if (ret != 1) {
888 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 837 invalidate_inode_pages2_range(inode->i_mapping, 0, index);
889 BTRFS_I(inode)->generation = 0; 838 BTRFS_I(inode)->generation = 0;
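
Taken together, the free-space-cache.c hunks remove the per-page crc array from the head of the cache file: the first page now starts with a single deliberately bogus crc slot (so older kernels fail their check and discard the cache) followed by the generation, and checksumming of the cache pages is left to the normal data checksum path. A sketch of the new first-page layout and of how the header is filled at write time (condensed from the hunks; illustrative rather than a verbatim copy):

    /*
     * free space cache, first page of the new format:
     *
     *   [ u64 bogus crc ][ u64 generation ][ entries / bitmaps ... ]
     *
     * all later pages carry entry or bitmap data from offset 0.
     */
    void *addr = kmap(page);
    u64 *gen;

    addr += sizeof(u64);            /* leave room for the bogus crc  */
    gen = addr;
    *gen = trans->transid;          /* generation is verified on load */
    addr += sizeof(u64);            /* free space entries start here  */
    /* ... fill in entries, then compute and store the bogus crc ...  */
    kunmap(page);
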
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index caa26ab5ed6..13e6255182e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -750,15 +750,6 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
750 return alloc_hint; 750 return alloc_hint;
751} 751}
752 752
753static inline bool is_free_space_inode(struct btrfs_root *root,
754 struct inode *inode)
755{
756 if (root == root->fs_info->tree_root ||
757 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
758 return true;
759 return false;
760}
761
762/* 753/*
763 * when extent_io.c finds a delayed allocation range in the file, 754 * when extent_io.c finds a delayed allocation range in the file,
764 * the call backs end up in this code. The basic idea is to 755 * the call backs end up in this code. The basic idea is to
@@ -791,7 +782,7 @@ static noinline int cow_file_range(struct inode *inode,
791 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 782 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
792 int ret = 0; 783 int ret = 0;
793 784
794 BUG_ON(is_free_space_inode(root, inode)); 785 BUG_ON(btrfs_is_free_space_inode(root, inode));
795 trans = btrfs_join_transaction(root); 786 trans = btrfs_join_transaction(root);
796 BUG_ON(IS_ERR(trans)); 787 BUG_ON(IS_ERR(trans));
797 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 788 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -1072,7 +1063,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1072 path = btrfs_alloc_path(); 1063 path = btrfs_alloc_path();
1073 BUG_ON(!path); 1064 BUG_ON(!path);
1074 1065
1075 nolock = is_free_space_inode(root, inode); 1066 nolock = btrfs_is_free_space_inode(root, inode);
1076 1067
1077 if (nolock) 1068 if (nolock)
1078 trans = btrfs_join_transaction_nolock(root); 1069 trans = btrfs_join_transaction_nolock(root);
@@ -1298,7 +1289,9 @@ static int btrfs_split_extent_hook(struct inode *inode,
1298 if (!(orig->state & EXTENT_DELALLOC)) 1289 if (!(orig->state & EXTENT_DELALLOC))
1299 return 0; 1290 return 0;
1300 1291
1301 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 1292 spin_lock(&BTRFS_I(inode)->lock);
1293 BTRFS_I(inode)->outstanding_extents++;
1294 spin_unlock(&BTRFS_I(inode)->lock);
1302 return 0; 1295 return 0;
1303} 1296}
1304 1297
@@ -1316,7 +1309,9 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1316 if (!(other->state & EXTENT_DELALLOC)) 1309 if (!(other->state & EXTENT_DELALLOC))
1317 return 0; 1310 return 0;
1318 1311
1319 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 1312 spin_lock(&BTRFS_I(inode)->lock);
1313 BTRFS_I(inode)->outstanding_extents--;
1314 spin_unlock(&BTRFS_I(inode)->lock);
1320 return 0; 1315 return 0;
1321} 1316}
1322 1317
@@ -1337,12 +1332,15 @@ static int btrfs_set_bit_hook(struct inode *inode,
1337 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1332 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1338 struct btrfs_root *root = BTRFS_I(inode)->root; 1333 struct btrfs_root *root = BTRFS_I(inode)->root;
1339 u64 len = state->end + 1 - state->start; 1334 u64 len = state->end + 1 - state->start;
1340 bool do_list = !is_free_space_inode(root, inode); 1335 bool do_list = !btrfs_is_free_space_inode(root, inode);
1341 1336
1342 if (*bits & EXTENT_FIRST_DELALLOC) 1337 if (*bits & EXTENT_FIRST_DELALLOC) {
1343 *bits &= ~EXTENT_FIRST_DELALLOC; 1338 *bits &= ~EXTENT_FIRST_DELALLOC;
1344 else 1339 } else {
1345 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 1340 spin_lock(&BTRFS_I(inode)->lock);
1341 BTRFS_I(inode)->outstanding_extents++;
1342 spin_unlock(&BTRFS_I(inode)->lock);
1343 }
1346 1344
1347 spin_lock(&root->fs_info->delalloc_lock); 1345 spin_lock(&root->fs_info->delalloc_lock);
1348 BTRFS_I(inode)->delalloc_bytes += len; 1346 BTRFS_I(inode)->delalloc_bytes += len;
@@ -1370,12 +1368,15 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1370 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1368 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1371 struct btrfs_root *root = BTRFS_I(inode)->root; 1369 struct btrfs_root *root = BTRFS_I(inode)->root;
1372 u64 len = state->end + 1 - state->start; 1370 u64 len = state->end + 1 - state->start;
1373 bool do_list = !is_free_space_inode(root, inode); 1371 bool do_list = !btrfs_is_free_space_inode(root, inode);
1374 1372
1375 if (*bits & EXTENT_FIRST_DELALLOC) 1373 if (*bits & EXTENT_FIRST_DELALLOC) {
1376 *bits &= ~EXTENT_FIRST_DELALLOC; 1374 *bits &= ~EXTENT_FIRST_DELALLOC;
1377 else if (!(*bits & EXTENT_DO_ACCOUNTING)) 1375 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1378 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 1376 spin_lock(&BTRFS_I(inode)->lock);
1377 BTRFS_I(inode)->outstanding_extents--;
1378 spin_unlock(&BTRFS_I(inode)->lock);
1379 }
1379 1380
1380 if (*bits & EXTENT_DO_ACCOUNTING) 1381 if (*bits & EXTENT_DO_ACCOUNTING)
1381 btrfs_delalloc_release_metadata(inode, len); 1382 btrfs_delalloc_release_metadata(inode, len);
@@ -1477,7 +1478,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1477 1478
1478 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1479 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1479 1480
1480 if (is_free_space_inode(root, inode)) 1481 if (btrfs_is_free_space_inode(root, inode))
1481 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); 1482 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
1482 else 1483 else
1483 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1484 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
@@ -1726,7 +1727,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1726 return 0; 1727 return 0;
1727 BUG_ON(!ordered_extent); 1728 BUG_ON(!ordered_extent);
1728 1729
1729 nolock = is_free_space_inode(root, inode); 1730 nolock = btrfs_is_free_space_inode(root, inode);
1730 1731
1731 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1732 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1732 BUG_ON(!list_empty(&ordered_extent->list)); 1733 BUG_ON(!list_empty(&ordered_extent->list));
@@ -2531,13 +2532,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
2531 2532
2532 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2533 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2533 struct btrfs_inode_item); 2534 struct btrfs_inode_item);
2534 if (!leaf->map_token)
2535 map_private_extent_buffer(leaf, (unsigned long)inode_item,
2536 sizeof(struct btrfs_inode_item),
2537 &leaf->map_token, &leaf->kaddr,
2538 &leaf->map_start, &leaf->map_len,
2539 KM_USER1);
2540
2541 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2535 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
2542 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); 2536 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
2543 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2537 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
@@ -2575,11 +2569,6 @@ cache_acl:
2575 if (!maybe_acls) 2569 if (!maybe_acls)
2576 cache_no_acl(inode); 2570 cache_no_acl(inode);
2577 2571
2578 if (leaf->map_token) {
2579 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2580 leaf->map_token = NULL;
2581 }
2582
2583 btrfs_free_path(path); 2572 btrfs_free_path(path);
2584 2573
2585 switch (inode->i_mode & S_IFMT) { 2574 switch (inode->i_mode & S_IFMT) {
@@ -2624,13 +2613,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2624 struct btrfs_inode_item *item, 2613 struct btrfs_inode_item *item,
2625 struct inode *inode) 2614 struct inode *inode)
2626{ 2615{
2627 if (!leaf->map_token)
2628 map_private_extent_buffer(leaf, (unsigned long)item,
2629 sizeof(struct btrfs_inode_item),
2630 &leaf->map_token, &leaf->kaddr,
2631 &leaf->map_start, &leaf->map_len,
2632 KM_USER1);
2633
2634 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2616 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2635 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2617 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2636 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2618 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
@@ -2659,11 +2641,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2659 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2641 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2660 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2642 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2661 btrfs_set_inode_block_group(leaf, item, 0); 2643 btrfs_set_inode_block_group(leaf, item, 0);
2662
2663 if (leaf->map_token) {
2664 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2665 leaf->map_token = NULL;
2666 }
2667} 2644}
2668 2645
2669/* 2646/*
@@ -2684,7 +2661,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2684 * The data relocation inode should also be directly updated 2661 * The data relocation inode should also be directly updated
2685 * without delay 2662 * without delay
2686 */ 2663 */
2687 if (!is_free_space_inode(root, inode) 2664 if (!btrfs_is_free_space_inode(root, inode)
2688 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { 2665 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2689 ret = btrfs_delayed_update_inode(trans, root, inode); 2666 ret = btrfs_delayed_update_inode(trans, root, inode);
2690 if (!ret) 2667 if (!ret)
@@ -3398,7 +3375,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3398 3375
3399 ret = -ENOMEM; 3376 ret = -ENOMEM;
3400again: 3377again:
3401 page = grab_cache_page(mapping, index); 3378 page = find_or_create_page(mapping, index, GFP_NOFS);
3402 if (!page) { 3379 if (!page) {
3403 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3380 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3404 goto out; 3381 goto out;
@@ -3634,7 +3611,7 @@ void btrfs_evict_inode(struct inode *inode)
3634 3611
3635 truncate_inode_pages(&inode->i_data, 0); 3612 truncate_inode_pages(&inode->i_data, 0);
3636 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3613 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3637 is_free_space_inode(root, inode))) 3614 btrfs_is_free_space_inode(root, inode)))
3638 goto no_delete; 3615 goto no_delete;
3639 3616
3640 if (is_bad_inode(inode)) { 3617 if (is_bad_inode(inode)) {
@@ -4271,7 +4248,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4271 if (BTRFS_I(inode)->dummy_inode) 4248 if (BTRFS_I(inode)->dummy_inode)
4272 return 0; 4249 return 0;
4273 4250
4274 if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode)) 4251 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
4275 nolock = true; 4252 nolock = true;
4276 4253
4277 if (wbc->sync_mode == WB_SYNC_ALL) { 4254 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -6728,8 +6705,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6728 ei->index_cnt = (u64)-1; 6705 ei->index_cnt = (u64)-1;
6729 ei->last_unlink_trans = 0; 6706 ei->last_unlink_trans = 0;
6730 6707
6731 atomic_set(&ei->outstanding_extents, 0); 6708 spin_lock_init(&ei->lock);
6732 atomic_set(&ei->reserved_extents, 0); 6709 ei->outstanding_extents = 0;
6710 ei->reserved_extents = 0;
6733 6711
6734 ei->ordered_data_close = 0; 6712 ei->ordered_data_close = 0;
6735 ei->orphan_meta_reserved = 0; 6713 ei->orphan_meta_reserved = 0;
@@ -6767,8 +6745,8 @@ void btrfs_destroy_inode(struct inode *inode)
6767 6745
6768 WARN_ON(!list_empty(&inode->i_dentry)); 6746 WARN_ON(!list_empty(&inode->i_dentry));
6769 WARN_ON(inode->i_data.nrpages); 6747 WARN_ON(inode->i_data.nrpages);
6770 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); 6748 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6771 WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents)); 6749 WARN_ON(BTRFS_I(inode)->reserved_extents);
6772 6750
6773 /* 6751 /*
6774 * This can happen where we create an inode, but somebody else also 6752 * This can happen where we create an inode, but somebody else also
@@ -6823,7 +6801,7 @@ int btrfs_drop_inode(struct inode *inode)
6823 struct btrfs_root *root = BTRFS_I(inode)->root; 6801 struct btrfs_root *root = BTRFS_I(inode)->root;
6824 6802
6825 if (btrfs_root_refs(&root->root_item) == 0 && 6803 if (btrfs_root_refs(&root->root_item) == 0 &&
6826 !is_free_space_inode(root, inode)) 6804 !btrfs_is_free_space_inode(root, inode))
6827 return 1; 6805 return 1;
6828 else 6806 else
6829 return generic_drop_inode(inode); 6807 return generic_drop_inode(inode);
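
The static is_free_space_inode() helper is removed from inode.c in favour of a shared btrfs_is_free_space_inode() used by all the call sites above. Going by the removed body and the btrfs_inode.h changes in the diffstat, the shared helper is presumably an inline along these lines (a reconstruction, not quoted from the patch):

    /* presumed inline in btrfs_inode.h; body mirrors the helper deleted above */
    static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
                                                 struct inode *inode)
    {
            if (root == root->fs_info->tree_root ||
                BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
                    return true;
            return false;
    }
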
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 622543309eb..0b980afc5ed 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -859,8 +859,8 @@ again:
859 /* step one, lock all the pages */ 859 /* step one, lock all the pages */
860 for (i = 0; i < num_pages; i++) { 860 for (i = 0; i < num_pages; i++) {
861 struct page *page; 861 struct page *page;
862 page = grab_cache_page(inode->i_mapping, 862 page = find_or_create_page(inode->i_mapping,
863 start_index + i); 863 start_index + i, GFP_NOFS);
864 if (!page) 864 if (!page)
865 break; 865 break;
866 866
@@ -930,7 +930,9 @@ again:
930 GFP_NOFS); 930 GFP_NOFS);
931 931
932 if (i_done != num_pages) { 932 if (i_done != num_pages) {
933 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 933 spin_lock(&BTRFS_I(inode)->lock);
934 BTRFS_I(inode)->outstanding_extents++;
935 spin_unlock(&BTRFS_I(inode)->lock);
934 btrfs_delalloc_release_space(inode, 936 btrfs_delalloc_release_space(inode,
935 (num_pages - i_done) << PAGE_CACHE_SHIFT); 937 (num_pages - i_done) << PAGE_CACHE_SHIFT);
936 } 938 }
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 66fa43dc3f0..d77b67c4b27 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -24,185 +24,197 @@
24#include "extent_io.h" 24#include "extent_io.h"
25#include "locking.h" 25#include "locking.h"
26 26
27static inline void spin_nested(struct extent_buffer *eb) 27void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
28{
29 spin_lock(&eb->lock);
30}
31 28
32/* 29/*
33 * Setting a lock to blocking will drop the spinlock and set the 30 * if we currently have a spinning reader or writer lock
34 * flag that forces other procs who want the lock to wait. After 31 * (indicated by the rw flag) this will bump the count
35 * this you can safely schedule with the lock held. 32 * of blocking holders and drop the spinlock.
36 */ 33 */
37void btrfs_set_lock_blocking(struct extent_buffer *eb) 34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
38{ 35{
39 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 36 if (rw == BTRFS_WRITE_LOCK) {
40 set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 37 if (atomic_read(&eb->blocking_writers) == 0) {
41 spin_unlock(&eb->lock); 38 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
39 atomic_dec(&eb->spinning_writers);
40 btrfs_assert_tree_locked(eb);
41 atomic_inc(&eb->blocking_writers);
42 write_unlock(&eb->lock);
43 }
44 } else if (rw == BTRFS_READ_LOCK) {
45 btrfs_assert_tree_read_locked(eb);
46 atomic_inc(&eb->blocking_readers);
47 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
48 atomic_dec(&eb->spinning_readers);
49 read_unlock(&eb->lock);
42 } 50 }
43 /* exit with the spin lock released and the bit set */ 51 return;
44} 52}
45 53
46/* 54/*
47 * clearing the blocking flag will take the spinlock again. 55 * if we currently have a blocking lock, take the spinlock
48 * After this you can't safely schedule 56 * and drop our blocking count
49 */ 57 */
50void btrfs_clear_lock_blocking(struct extent_buffer *eb) 58void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
51{ 59{
52 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 60 if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
53 spin_nested(eb); 61 BUG_ON(atomic_read(&eb->blocking_writers) != 1);
54 clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 62 write_lock(&eb->lock);
55 smp_mb__after_clear_bit(); 63 WARN_ON(atomic_read(&eb->spinning_writers));
64 atomic_inc(&eb->spinning_writers);
65 if (atomic_dec_and_test(&eb->blocking_writers))
66 wake_up(&eb->write_lock_wq);
67 } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
68 BUG_ON(atomic_read(&eb->blocking_readers) == 0);
69 read_lock(&eb->lock);
70 atomic_inc(&eb->spinning_readers);
71 if (atomic_dec_and_test(&eb->blocking_readers))
72 wake_up(&eb->read_lock_wq);
56 } 73 }
57 /* exit with the spin lock held */ 74 return;
58} 75}
59 76
60/* 77/*
61 * unfortunately, many of the places that currently set a lock to blocking 78 * take a spinning read lock. This will wait for any blocking
62 * don't end up blocking for very long, and often they don't block 79 * writers
63 * at all. For a dbench 50 run, if we don't spin on the blocking bit
64 * at all, the context switch rate can jump up to 400,000/sec or more.
65 *
66 * So, we're still stuck with this crummy spin on the blocking bit,
67 * at least until the most common causes of the short blocks
68 * can be dealt with.
69 */ 80 */
70static int btrfs_spin_on_block(struct extent_buffer *eb) 81void btrfs_tree_read_lock(struct extent_buffer *eb)
71{ 82{
72 int i; 83again:
73 84 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
74 for (i = 0; i < 512; i++) { 85 read_lock(&eb->lock);
75 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 86 if (atomic_read(&eb->blocking_writers)) {
76 return 1; 87 read_unlock(&eb->lock);
77 if (need_resched()) 88 wait_event(eb->write_lock_wq,
78 break; 89 atomic_read(&eb->blocking_writers) == 0);
79 cpu_relax(); 90 goto again;
80 } 91 }
81 return 0; 92 atomic_inc(&eb->read_locks);
93 atomic_inc(&eb->spinning_readers);
82} 94}
83 95
84/* 96/*
85 * This is somewhat different from trylock. It will take the 97 * returns 1 if we get the read lock and 0 if we don't
86 * spinlock but if it finds the lock is set to blocking, it will 98 * this won't wait for blocking writers
87 * return without the lock held.
88 *
89 * returns 1 if it was able to take the lock and zero otherwise
90 *
91 * After this call, scheduling is not safe without first calling
92 * btrfs_set_lock_blocking()
93 */ 99 */
94int btrfs_try_spin_lock(struct extent_buffer *eb) 100int btrfs_try_tree_read_lock(struct extent_buffer *eb)
95{ 101{
96 int i; 102 if (atomic_read(&eb->blocking_writers))
103 return 0;
97 104
98 if (btrfs_spin_on_block(eb)) { 105 read_lock(&eb->lock);
99 spin_nested(eb); 106 if (atomic_read(&eb->blocking_writers)) {
100 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 107 read_unlock(&eb->lock);
101 return 1; 108 return 0;
102 spin_unlock(&eb->lock);
103 } 109 }
104 /* spin for a bit on the BLOCKING flag */ 110 atomic_inc(&eb->read_locks);
105 for (i = 0; i < 2; i++) { 111 atomic_inc(&eb->spinning_readers);
106 cpu_relax(); 112 return 1;
107 if (!btrfs_spin_on_block(eb))
108 break;
109
110 spin_nested(eb);
111 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
112 return 1;
113 spin_unlock(&eb->lock);
114 }
115 return 0;
116} 113}
117 114
118/* 115/*
119 * the autoremove wake function will return 0 if it tried to wake up 116 * returns 1 if we get the read lock and 0 if we don't
120 * a process that was already awake, which means that process won't 117 * this won't wait for blocking writers or readers
121 * count as an exclusive wakeup. The waitq code will continue waking
122 * procs until it finds one that was actually sleeping.
123 *
124 * For btrfs, this isn't quite what we want. We want a single proc
125 * to be notified that the lock is ready for taking. If that proc
126 * already happen to be awake, great, it will loop around and try for
127 * the lock.
128 *
129 * So, btrfs_wake_function always returns 1, even when the proc that we
130 * tried to wake up was already awake.
131 */ 118 */
132static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, 119int btrfs_try_tree_write_lock(struct extent_buffer *eb)
133 int sync, void *key)
134{ 120{
135 autoremove_wake_function(wait, mode, sync, key); 121 if (atomic_read(&eb->blocking_writers) ||
122 atomic_read(&eb->blocking_readers))
123 return 0;
124 write_lock(&eb->lock);
125 if (atomic_read(&eb->blocking_writers) ||
126 atomic_read(&eb->blocking_readers)) {
127 write_unlock(&eb->lock);
128 return 0;
129 }
130 atomic_inc(&eb->write_locks);
131 atomic_inc(&eb->spinning_writers);
136 return 1; 132 return 1;
137} 133}
138 134
139/* 135/*
140 * returns with the extent buffer spinlocked. 136 * drop a spinning read lock
141 * 137 */
142 * This will spin and/or wait as required to take the lock, and then 138void btrfs_tree_read_unlock(struct extent_buffer *eb)
143 * return with the spinlock held. 139{
144 * 140 btrfs_assert_tree_read_locked(eb);
145 * After this call, scheduling is not safe without first calling 141 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
146 * btrfs_set_lock_blocking() 142 atomic_dec(&eb->spinning_readers);
143 atomic_dec(&eb->read_locks);
144 read_unlock(&eb->lock);
145}
146
147/*
148 * drop a blocking read lock
149 */
150void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
151{
152 btrfs_assert_tree_read_locked(eb);
153 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
154 if (atomic_dec_and_test(&eb->blocking_readers))
155 wake_up(&eb->read_lock_wq);
156 atomic_dec(&eb->read_locks);
157}
158
159/*
160 * take a spinning write lock. This will wait for both
161 * blocking readers or writers
147 */ 162 */
148int btrfs_tree_lock(struct extent_buffer *eb) 163int btrfs_tree_lock(struct extent_buffer *eb)
149{ 164{
150 DEFINE_WAIT(wait); 165again:
151 wait.func = btrfs_wake_function; 166 wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
152 167 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
153 if (!btrfs_spin_on_block(eb)) 168 write_lock(&eb->lock);
154 goto sleep; 169 if (atomic_read(&eb->blocking_readers)) {
155 170 write_unlock(&eb->lock);
156 while(1) { 171 wait_event(eb->read_lock_wq,
157 spin_nested(eb); 172 atomic_read(&eb->blocking_readers) == 0);
158 173 goto again;
159 /* nobody is blocking, exit with the spinlock held */
160 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
161 return 0;
162
163 /*
164 * we have the spinlock, but the real owner is blocking.
165 * wait for them
166 */
167 spin_unlock(&eb->lock);
168
169 /*
170 * spin for a bit, and if the blocking flag goes away,
171 * loop around
172 */
173 cpu_relax();
174 if (btrfs_spin_on_block(eb))
175 continue;
176sleep:
177 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
178 TASK_UNINTERRUPTIBLE);
179
180 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
181 schedule();
182
183 finish_wait(&eb->lock_wq, &wait);
184 } 174 }
175 if (atomic_read(&eb->blocking_writers)) {
176 write_unlock(&eb->lock);
177 wait_event(eb->write_lock_wq,
178 atomic_read(&eb->blocking_writers) == 0);
179 goto again;
180 }
181 WARN_ON(atomic_read(&eb->spinning_writers));
182 atomic_inc(&eb->spinning_writers);
183 atomic_inc(&eb->write_locks);
185 return 0; 184 return 0;
186} 185}
187 186
187/*
188 * drop a spinning or a blocking write lock.
189 */
188int btrfs_tree_unlock(struct extent_buffer *eb) 190int btrfs_tree_unlock(struct extent_buffer *eb)
189{ 191{
190 /* 192 int blockers = atomic_read(&eb->blocking_writers);
191 * if we were a blocking owner, we don't have the spinlock held 193
192 * just clear the bit and look for waiters 194 BUG_ON(blockers > 1);
193 */ 195
194 if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 196 btrfs_assert_tree_locked(eb);
195 smp_mb__after_clear_bit(); 197 atomic_dec(&eb->write_locks);
196 else 198
197 spin_unlock(&eb->lock); 199 if (blockers) {
198 200 WARN_ON(atomic_read(&eb->spinning_writers));
199 if (waitqueue_active(&eb->lock_wq)) 201 atomic_dec(&eb->blocking_writers);
200 wake_up(&eb->lock_wq); 202 smp_wmb();
203 wake_up(&eb->write_lock_wq);
204 } else {
205 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
206 atomic_dec(&eb->spinning_writers);
207 write_unlock(&eb->lock);
208 }
201 return 0; 209 return 0;
202} 210}
203 211
204void btrfs_assert_tree_locked(struct extent_buffer *eb) 212void btrfs_assert_tree_locked(struct extent_buffer *eb)
205{ 213{
206 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 214 BUG_ON(!atomic_read(&eb->write_locks));
207 assert_spin_locked(&eb->lock); 215}
216
217void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
218{
219 BUG_ON(!atomic_read(&eb->read_locks));
208} 220}
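
The rewritten locking.c swaps the old spinlock-plus-blocking-bit scheme for a rwlock backed by per-buffer atomic counters and two wait queues. A minimal sketch of how a reader is expected to use the new entry points (eb is an extent_buffer the caller already holds a reference on; the work in between is a placeholder):

    btrfs_tree_read_lock(eb);               /* spinning read lock          */
    /* ... short, non-sleeping inspection of the buffer ... */

    btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
    /* now a blocking read lock: safe to schedule or wait on IO here */

    btrfs_clear_lock_blocking_rw(eb, BTRFS_READ_LOCK_BLOCKING);
    /* back to a spinning read lock */

    btrfs_tree_read_unlock(eb);
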
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 5c33a560a2f..17247ddb81a 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -19,11 +19,43 @@
19#ifndef __BTRFS_LOCKING_ 19#ifndef __BTRFS_LOCKING_
20#define __BTRFS_LOCKING_ 20#define __BTRFS_LOCKING_
21 21
22#define BTRFS_WRITE_LOCK 1
23#define BTRFS_READ_LOCK 2
24#define BTRFS_WRITE_LOCK_BLOCKING 3
25#define BTRFS_READ_LOCK_BLOCKING 4
26
22int btrfs_tree_lock(struct extent_buffer *eb); 27int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 28int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_try_spin_lock(struct extent_buffer *eb); 29int btrfs_try_spin_lock(struct extent_buffer *eb);
25 30
26void btrfs_set_lock_blocking(struct extent_buffer *eb); 31void btrfs_tree_read_lock(struct extent_buffer *eb);
27void btrfs_clear_lock_blocking(struct extent_buffer *eb); 32void btrfs_tree_read_unlock(struct extent_buffer *eb);
33void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
35void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
28void btrfs_assert_tree_locked(struct extent_buffer *eb); 36void btrfs_assert_tree_locked(struct extent_buffer *eb);
37int btrfs_try_tree_read_lock(struct extent_buffer *eb);
38int btrfs_try_tree_write_lock(struct extent_buffer *eb);
39
40static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
41{
42 if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
43 btrfs_tree_unlock(eb);
44 else if (rw == BTRFS_READ_LOCK_BLOCKING)
45 btrfs_tree_read_unlock_blocking(eb);
46 else if (rw == BTRFS_READ_LOCK)
47 btrfs_tree_read_unlock(eb);
48 else
49 BUG();
50}
51
52static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
53{
54 btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
55}
56
57static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
58{
59 btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
60}
29#endif 61#endif
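
The new BTRFS_*_LOCK constants let a caller remember which flavour of lock it holds and drop it generically through btrfs_tree_unlock_rw(). A small usage sketch (lock_state is illustrative; real callers typically stash this value in the btrfs_path, which is not shown here):

    int lock_state;

    if (btrfs_try_tree_write_lock(eb)) {
            lock_state = BTRFS_WRITE_LOCK;       /* got a spinning write lock */
    } else {
            btrfs_tree_read_lock(eb);
            lock_state = BTRFS_READ_LOCK;        /* fall back to a read lock  */
    }

    /* ... use the buffer according to lock_state ... */

    btrfs_tree_unlock_rw(eb, lock_state);        /* drops whichever lock is held */
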
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 5e0a3dc79a4..59bb1764273 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2955,7 +2955,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
2955 page_cache_sync_readahead(inode->i_mapping, 2955 page_cache_sync_readahead(inode->i_mapping,
2956 ra, NULL, index, 2956 ra, NULL, index,
2957 last_index + 1 - index); 2957 last_index + 1 - index);
2958 page = grab_cache_page(inode->i_mapping, index); 2958 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS);
2959 if (!page) { 2960 if (!page) {
2960 btrfs_delalloc_release_metadata(inode, 2961 btrfs_delalloc_release_metadata(inode,
2961 PAGE_CACHE_SIZE); 2962 PAGE_CACHE_SIZE);
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index c0f7ecaf1e7..bc1f6ad1844 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -50,36 +50,22 @@ u##bits btrfs_##name(struct extent_buffer *eb, \
50 unsigned long part_offset = (unsigned long)s; \ 50 unsigned long part_offset = (unsigned long)s; \
51 unsigned long offset = part_offset + offsetof(type, member); \ 51 unsigned long offset = part_offset + offsetof(type, member); \
52 type *p; \ 52 type *p; \
53 /* ugly, but we want the fast path here */ \ 53 int err; \
54 if (eb->map_token && offset >= eb->map_start && \ 54 char *kaddr; \
55 offset + sizeof(((type *)0)->member) <= eb->map_start + \ 55 unsigned long map_start; \
56 eb->map_len) { \ 56 unsigned long map_len; \
57 p = (type *)(eb->kaddr + part_offset - eb->map_start); \ 57 u##bits res; \
58 return le##bits##_to_cpu(p->member); \ 58 err = map_private_extent_buffer(eb, offset, \
59 } \ 59 sizeof(((type *)0)->member), \
60 { \ 60 &kaddr, &map_start, &map_len); \
61 int err; \ 61 if (err) { \
62 char *map_token; \ 62 __le##bits leres; \
63 char *kaddr; \ 63 read_eb_member(eb, s, type, member, &leres); \
64 int unmap_on_exit = (eb->map_token == NULL); \ 64 return le##bits##_to_cpu(leres); \
65 unsigned long map_start; \ 65 } \
66 unsigned long map_len; \ 66 p = (type *)(kaddr + part_offset - map_start); \
67 u##bits res; \ 67 res = le##bits##_to_cpu(p->member); \
68 err = map_extent_buffer(eb, offset, \ 68 return res; \
69 sizeof(((type *)0)->member), \
70 &map_token, &kaddr, \
71 &map_start, &map_len, KM_USER1); \
72 if (err) { \
73 __le##bits leres; \
74 read_eb_member(eb, s, type, member, &leres); \
75 return le##bits##_to_cpu(leres); \
76 } \
77 p = (type *)(kaddr + part_offset - map_start); \
78 res = le##bits##_to_cpu(p->member); \
79 if (unmap_on_exit) \
80 unmap_extent_buffer(eb, map_token, KM_USER1); \
81 return res; \
82 } \
83} \ 69} \
84void btrfs_set_##name(struct extent_buffer *eb, \ 70void btrfs_set_##name(struct extent_buffer *eb, \
85 type *s, u##bits val) \ 71 type *s, u##bits val) \
@@ -87,36 +73,21 @@ void btrfs_set_##name(struct extent_buffer *eb, \
87 unsigned long part_offset = (unsigned long)s; \ 73 unsigned long part_offset = (unsigned long)s; \
88 unsigned long offset = part_offset + offsetof(type, member); \ 74 unsigned long offset = part_offset + offsetof(type, member); \
89 type *p; \ 75 type *p; \
90 /* ugly, but we want the fast path here */ \ 76 int err; \
91 if (eb->map_token && offset >= eb->map_start && \ 77 char *kaddr; \
92 offset + sizeof(((type *)0)->member) <= eb->map_start + \ 78 unsigned long map_start; \
93 eb->map_len) { \ 79 unsigned long map_len; \
94 p = (type *)(eb->kaddr + part_offset - eb->map_start); \ 80 err = map_private_extent_buffer(eb, offset, \
95 p->member = cpu_to_le##bits(val); \ 81 sizeof(((type *)0)->member), \
96 return; \ 82 &kaddr, &map_start, &map_len); \
97 } \ 83 if (err) { \
98 { \ 84 __le##bits val2; \
99 int err; \ 85 val2 = cpu_to_le##bits(val); \
100 char *map_token; \ 86 write_eb_member(eb, s, type, member, &val2); \
101 char *kaddr; \ 87 return; \
102 int unmap_on_exit = (eb->map_token == NULL); \ 88 } \
103 unsigned long map_start; \ 89 p = (type *)(kaddr + part_offset - map_start); \
104 unsigned long map_len; \ 90 p->member = cpu_to_le##bits(val); \
105 err = map_extent_buffer(eb, offset, \
106 sizeof(((type *)0)->member), \
107 &map_token, &kaddr, \
108 &map_start, &map_len, KM_USER1); \
109 if (err) { \
110 __le##bits val2; \
111 val2 = cpu_to_le##bits(val); \
112 write_eb_member(eb, s, type, member, &val2); \
113 return; \
114 } \
115 p = (type *)(kaddr + part_offset - map_start); \
116 p->member = cpu_to_le##bits(val); \
117 if (unmap_on_exit) \
118 unmap_extent_buffer(eb, map_token, KM_USER1); \
119 } \
120} 91}
121 92
122#include "ctree.h" 93#include "ctree.h"
@@ -125,15 +96,6 @@ void btrfs_node_key(struct extent_buffer *eb,
125 struct btrfs_disk_key *disk_key, int nr) 96 struct btrfs_disk_key *disk_key, int nr)
126{ 97{
127 unsigned long ptr = btrfs_node_key_ptr_offset(nr); 98 unsigned long ptr = btrfs_node_key_ptr_offset(nr);
128 if (eb->map_token && ptr >= eb->map_start &&
129 ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
130 memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
131 sizeof(*disk_key));
132 return;
133 } else if (eb->map_token) {
134 unmap_extent_buffer(eb, eb->map_token, KM_USER1);
135 eb->map_token = NULL;
136 }
137 read_eb_member(eb, (struct btrfs_key_ptr *)ptr, 99 read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
138 struct btrfs_key_ptr, key, disk_key); 100 struct btrfs_key_ptr, key, disk_key);
139} 101}
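
With the cached map_token gone from the extent_buffer, every generated accessor now maps only the bytes it needs via map_private_extent_buffer() and falls back to read_eb_member()/write_eb_member() when the field straddles a mapping boundary. Expanding the read side of the macro by hand for a hypothetical u64 field gives roughly the following (btrfs_foo and bar are made-up names; this is an illustrative expansion, not code from the patch):

    u64 btrfs_foo_bar(struct extent_buffer *eb, struct btrfs_foo *s)
    {
            unsigned long part_offset = (unsigned long)s;
            unsigned long offset = part_offset + offsetof(struct btrfs_foo, bar);
            struct btrfs_foo *p;
            int err;
            char *kaddr;
            unsigned long map_start;
            unsigned long map_len;
            u64 res;

            err = map_private_extent_buffer(eb, offset,
                                            sizeof(((struct btrfs_foo *)0)->bar),
                                            &kaddr, &map_start, &map_len);
            if (err) {
                    /* field crosses a mapping boundary: use the slow copy path */
                    __le64 leres;
                    read_eb_member(eb, s, struct btrfs_foo, bar, &leres);
                    return le64_to_cpu(leres);
            }
            p = (struct btrfs_foo *)(kaddr + part_offset - map_start);
            res = le64_to_cpu(p->bar);
            return res;
    }
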
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 51dcec86757..eb55863bb4a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -260,7 +260,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
260{ 260{
261 struct btrfs_trans_handle *h; 261 struct btrfs_trans_handle *h;
262 struct btrfs_transaction *cur_trans; 262 struct btrfs_transaction *cur_trans;
263 int retries = 0; 263 u64 num_bytes = 0;
264 int ret; 264 int ret;
265 265
266 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 266 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
@@ -274,6 +274,19 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
274 h->block_rsv = NULL; 274 h->block_rsv = NULL;
275 goto got_it; 275 goto got_it;
276 } 276 }
277
278 /*
279 * Do the reservation before we join the transaction so we can do all
280 * the appropriate flushing if need be.
281 */
282 if (num_items > 0 && root != root->fs_info->chunk_root) {
283 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
284 ret = btrfs_block_rsv_add(NULL, root,
285 &root->fs_info->trans_block_rsv,
286 num_bytes);
287 if (ret)
288 return ERR_PTR(ret);
289 }
277again: 290again:
278 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 291 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
279 if (!h) 292 if (!h)
@@ -310,24 +323,9 @@ again:
310 goto again; 323 goto again;
311 } 324 }
312 325
313 if (num_items > 0) { 326 if (num_bytes) {
314 ret = btrfs_trans_reserve_metadata(h, root, num_items); 327 h->block_rsv = &root->fs_info->trans_block_rsv;
315 if (ret == -EAGAIN && !retries) { 328 h->bytes_reserved = num_bytes;
316 retries++;
317 btrfs_commit_transaction(h, root);
318 goto again;
319 } else if (ret == -EAGAIN) {
320 /*
321 * We have already retried and got EAGAIN, so really we
322 * don't have space, so set ret to -ENOSPC.
323 */
324 ret = -ENOSPC;
325 }
326
327 if (ret < 0) {
328 btrfs_end_transaction(h, root);
329 return ERR_PTR(ret);
330 }
331 } 329 }
332 330
333got_it: 331got_it:
@@ -499,10 +497,17 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
499 } 497 }
500 498
501 if (lock && cur_trans->blocked && !cur_trans->in_commit) { 499 if (lock && cur_trans->blocked && !cur_trans->in_commit) {
502 if (throttle) 500 if (throttle) {
501 /*
502 * We may race with somebody else here so end up having
503 * to call end_transaction on ourselves again, so inc
504 * our use_count.
505 */
506 trans->use_count++;
503 return btrfs_commit_transaction(trans, root); 507 return btrfs_commit_transaction(trans, root);
504 else 508 } else {
505 wake_up_process(info->transaction_kthread); 509 wake_up_process(info->transaction_kthread);
510 }
506 } 511 }
507 512
508 WARN_ON(cur_trans != info->running_transaction); 513 WARN_ON(cur_trans != info->running_transaction);
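
The metadata reservation is now taken before the handle joins a transaction, so any flushing done by btrfs_block_rsv_add() happens outside the running transaction and the old retry-on-EAGAIN/commit dance can go away. Condensed from the hunks above (error handling trimmed; not the complete start_transaction()):

    u64 num_bytes = 0;

    /* reserve metadata space up front, before joining the transaction */
    if (num_items > 0 && root != root->fs_info->chunk_root) {
            num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
            ret = btrfs_block_rsv_add(NULL, root,
                                      &root->fs_info->trans_block_rsv,
                                      num_bytes);
            if (ret)
                    return ERR_PTR(ret);
    }

    /* ... allocate the handle and join/start the transaction ... */

    if (num_bytes) {
            /* hand the pre-reserved space to this handle */
            h->block_rsv = &root->fs_info->trans_block_rsv;
            h->bytes_reserved = num_bytes;
    }
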
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4ce8a9f41d1..ac278dd8317 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1730,8 +1730,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1730 btrfs_read_buffer(next, ptr_gen); 1730 btrfs_read_buffer(next, ptr_gen);
1731 1731
1732 btrfs_tree_lock(next); 1732 btrfs_tree_lock(next);
1733 clean_tree_block(trans, root, next);
1734 btrfs_set_lock_blocking(next); 1733 btrfs_set_lock_blocking(next);
1734 clean_tree_block(trans, root, next);
1735 btrfs_wait_tree_block_writeback(next); 1735 btrfs_wait_tree_block_writeback(next);
1736 btrfs_tree_unlock(next); 1736 btrfs_tree_unlock(next);
1737 1737
@@ -1796,8 +1796,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1796 next = path->nodes[*level]; 1796 next = path->nodes[*level];
1797 1797
1798 btrfs_tree_lock(next); 1798 btrfs_tree_lock(next);
1799 clean_tree_block(trans, root, next);
1800 btrfs_set_lock_blocking(next); 1799 btrfs_set_lock_blocking(next);
1800 clean_tree_block(trans, root, next);
1801 btrfs_wait_tree_block_writeback(next); 1801 btrfs_wait_tree_block_writeback(next);
1802 btrfs_tree_unlock(next); 1802 btrfs_tree_unlock(next);
1803 1803
@@ -1864,8 +1864,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1864 next = path->nodes[orig_level]; 1864 next = path->nodes[orig_level];
1865 1865
1866 btrfs_tree_lock(next); 1866 btrfs_tree_lock(next);
1867 clean_tree_block(trans, log, next);
1868 btrfs_set_lock_blocking(next); 1867 btrfs_set_lock_blocking(next);
1868 clean_tree_block(trans, log, next);
1869 btrfs_wait_tree_block_writeback(next); 1869 btrfs_wait_tree_block_writeback(next);
1870 btrfs_tree_unlock(next); 1870 btrfs_tree_unlock(next);
1871 1871
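
All three tree-log.c hunks make the same ordering fix: the buffer lock is switched to blocking before clean_tree_block() runs, since with the new reader/writer locks a spinning lock must not be held across work that may block. The corrected sequence as it now appears in each walk function:

    btrfs_tree_lock(next);
    btrfs_set_lock_blocking(next);          /* mark blocking before potentially sleeping work */
    clean_tree_block(trans, root, next);
    btrfs_wait_tree_block_writeback(next);
    btrfs_tree_unlock(next);
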
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 19450bc5363..b89e372c754 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3595,7 +3595,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
3595 if (!sb) 3595 if (!sb)
3596 return -ENOMEM; 3596 return -ENOMEM;
3597 btrfs_set_buffer_uptodate(sb); 3597 btrfs_set_buffer_uptodate(sb);
3598 btrfs_set_buffer_lockdep_class(sb, 0); 3598 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
3599 3599
3600 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 3600 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3601 array_size = btrfs_super_sys_array_size(super_copy); 3601 array_size = btrfs_super_sys_array_size(super_copy);
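
btrfs_set_buffer_lockdep_class() now takes the owning root's objectid as its first argument, so each root's tree blocks get their own lockdep class rather than sharing one keyed only on level. Assumed prototype and a typical call (the prototype is inferred from this call site, not quoted from disk-io.h):

    /* assumed prototype after this series */
    void btrfs_set_buffer_lockdep_class(u64 objectid,
                                        struct extent_buffer *eb, int level);

    /* tag a freshly allocated buffer with its owning root and level */
    btrfs_set_buffer_lockdep_class(root->root_key.objectid, eb, level);
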
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 5366fe452ab..d733b9cfea3 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -102,43 +102,57 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
102 if (!path) 102 if (!path)
103 return -ENOMEM; 103 return -ENOMEM;
104 104
105 /* first lets see if we already have this xattr */ 105 if (flags & XATTR_REPLACE) {
106 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, 106 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
107 strlen(name), -1); 107 name_len, -1);
108 if (IS_ERR(di)) { 108 if (IS_ERR(di)) {
109 ret = PTR_ERR(di); 109 ret = PTR_ERR(di);
110 goto out; 110 goto out;
111 } 111 } else if (!di) {
112 112 ret = -ENODATA;
113 /* ok we already have this xattr, lets remove it */
114 if (di) {
115 /* if we want create only exit */
116 if (flags & XATTR_CREATE) {
117 ret = -EEXIST;
118 goto out; 113 goto out;
119 } 114 }
120
121 ret = btrfs_delete_one_dir_name(trans, root, path, di); 115 ret = btrfs_delete_one_dir_name(trans, root, path, di);
122 BUG_ON(ret); 116 if (ret)
117 goto out;
123 btrfs_release_path(path); 118 btrfs_release_path(path);
119 }
124 120
125 /* if we don't have a value then we are removing the xattr */ 121again:
126 if (!value) 122 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
123 name, name_len, value, size);
124 if (ret == -EEXIST) {
125 if (flags & XATTR_CREATE)
127 goto out; 126 goto out;
128 } else { 127 /*
128 * We can't use the path we already have since we won't have the
129 * proper locking for a delete, so release the path and
130 * re-lookup to delete the thing.
131 */
129 btrfs_release_path(path); 132 btrfs_release_path(path);
133 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
134 name, name_len, -1);
135 if (IS_ERR(di)) {
136 ret = PTR_ERR(di);
137 goto out;
138 } else if (!di) {
139 /* Shouldn't happen but just in case... */
140 btrfs_release_path(path);
141 goto again;
142 }
130 143
131 if (flags & XATTR_REPLACE) { 144 ret = btrfs_delete_one_dir_name(trans, root, path, di);
132 /* we couldn't find the attr to replace */ 145 if (ret)
133 ret = -ENODATA;
134 goto out; 146 goto out;
147
148 /*
149 * We have a value to set, so go back and try to insert it now.
150 */
151 if (value) {
152 btrfs_release_path(path);
153 goto again;
135 } 154 }
136 } 155 }
137
138 /* ok we have to create a completely new xattr */
139 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
140 name, name_len, value, size);
141 BUG_ON(ret);
142out: 156out:
143 btrfs_free_path(path); 157 btrfs_free_path(path);
144 return ret; 158 return ret;