path: root/fs/btrfs
author     Linus Torvalds <torvalds@linux-foundation.org>  2009-04-03 18:14:44 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2009-04-03 18:14:44 -0400
commit     b983471794e568fd71fa767da77a62ba517c3e63 (patch)
tree       92a1cc26c4846b49d90225d004ba1b7bd6fe3d81 /fs/btrfs
parent     5a3ae276057840f0e60664c12fc3ef80aa59d1d4 (diff)
parent     c293498be69816087746161338de4b81efdf69fc (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
  Btrfs: BUG to BUG_ON changes
  Btrfs: remove dead code
  Btrfs: remove dead code
  Btrfs: fix typos in comments
  Btrfs: remove unused ftrace include
  Btrfs: fix __ucmpdi2 compile bug on 32 bit builds
  Btrfs: free inode struct when btrfs_new_inode fails
  Btrfs: fix race in worker_loop
  Btrfs: add flushoncommit mount option
  Btrfs: notreelog mount option
  Btrfs: introduce btrfs_show_options
  Btrfs: rework allocation clustering
  Btrfs: Optimize locking in btrfs_next_leaf()
  Btrfs: break up btrfs_search_slot into smaller pieces
  Btrfs: kill the pinned_mutex
  Btrfs: kill the block group alloc mutex
  Btrfs: clean up find_free_extent
  Btrfs: free space cache cleanups
  Btrfs: unplug in the async bio submission threads
  Btrfs: keep processing bios for a given bdev if our proc is batching
Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/async-thread.c      |   7
-rw-r--r--  fs/btrfs/ctree.c             | 312
-rw-r--r--  fs/btrfs/ctree.h             |  84
-rw-r--r--  fs/btrfs/delayed-ref.c       |   1
-rw-r--r--  fs/btrfs/disk-io.c           |   8
-rw-r--r--  fs/btrfs/extent-tree.c       | 398
-rw-r--r--  fs/btrfs/extent_io.c         |  16
-rw-r--r--  fs/btrfs/extent_map.c        |   1
-rw-r--r--  fs/btrfs/free-space-cache.c  | 530
-rw-r--r--  fs/btrfs/free-space-cache.h  |  44
-rw-r--r--  fs/btrfs/inode.c             |   5
-rw-r--r--  fs/btrfs/locking.c           |   4
-rw-r--r--  fs/btrfs/super.c             |  54
-rw-r--r--  fs/btrfs/transaction.c       |   7
-rw-r--r--  fs/btrfs/tree-log.c          |  12
-rw-r--r--  fs/btrfs/volumes.c           |  41
-rw-r--r--  fs/btrfs/volumes.h           |   2
17 files changed, 982 insertions(+), 544 deletions(-)
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c84ca1f5259a..51bfdfc8fcda 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -20,7 +20,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
-#include <linux/ftrace.h>
 #include "async-thread.h"
 
 #define WORK_QUEUED_BIT 0
@@ -195,6 +194,9 @@ again_locked:
 				if (!list_empty(&worker->pending))
 					continue;
 
+				if (kthread_should_stop())
+					break;
+
 				/* still no more work?, sleep for real */
 				spin_lock_irq(&worker->lock);
 				set_current_state(TASK_INTERRUPTIBLE);
@@ -208,7 +210,8 @@ again_locked:
 				worker->working = 0;
 				spin_unlock_irq(&worker->lock);
 
-				schedule();
+				if (!kthread_should_stop())
+					schedule();
 			}
 			__set_current_state(TASK_RUNNING);
 		}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dbb724124633..e5b2533b691a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1244,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
  * readahead one full node of leaves, finding things that are close
  * to the block in 'slot', and triggering ra on them.
  */
-static noinline void reada_for_search(struct btrfs_root *root,
+static void reada_for_search(struct btrfs_root *root,
 			     struct btrfs_path *path,
 			     int level, int slot, u64 objectid)
 {
 	struct extent_buffer *node;
 	struct btrfs_disk_key disk_key;
@@ -1447,6 +1447,117 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1447} 1447}
1448 1448
1449/* 1449/*
1450 * helper function for btrfs_search_slot. The goal is to find a block
1451 * in cache without setting the path to blocking. If we find the block
1452 * we return zero and the path is unchanged.
1453 *
1454 * If we can't find the block, we set the path blocking and do some
1455 * reada. -EAGAIN is returned and the search must be repeated.
1456 */
1457static int
1458read_block_for_search(struct btrfs_trans_handle *trans,
1459 struct btrfs_root *root, struct btrfs_path *p,
1460 struct extent_buffer **eb_ret, int level, int slot,
1461 struct btrfs_key *key)
1462{
1463 u64 blocknr;
1464 u64 gen;
1465 u32 blocksize;
1466 struct extent_buffer *b = *eb_ret;
1467 struct extent_buffer *tmp;
1468
1469 blocknr = btrfs_node_blockptr(b, slot);
1470 gen = btrfs_node_ptr_generation(b, slot);
1471 blocksize = btrfs_level_size(root, level - 1);
1472
1473 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1474 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1475 *eb_ret = tmp;
1476 return 0;
1477 }
1478
1479 /*
1480 * reduce lock contention at high levels
1481 * of the btree by dropping locks before
1482 * we read.
1483 */
1484 btrfs_release_path(NULL, p);
1485 if (tmp)
1486 free_extent_buffer(tmp);
1487 if (p->reada)
1488 reada_for_search(root, p, level, slot, key->objectid);
1489
1490 tmp = read_tree_block(root, blocknr, blocksize, gen);
1491 if (tmp)
1492 free_extent_buffer(tmp);
1493 return -EAGAIN;
1494}
1495
1496/*
1497 * helper function for btrfs_search_slot. This does all of the checks
1498 * for node-level blocks and does any balancing required based on
1499 * the ins_len.
1500 *
1501 * If no extra work was required, zero is returned. If we had to
1502 * drop the path, -EAGAIN is returned and btrfs_search_slot must
1503 * start over
1504 */
1505static int
1506setup_nodes_for_search(struct btrfs_trans_handle *trans,
1507 struct btrfs_root *root, struct btrfs_path *p,
1508 struct extent_buffer *b, int level, int ins_len)
1509{
1510 int ret;
1511 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1512 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1513 int sret;
1514
1515 sret = reada_for_balance(root, p, level);
1516 if (sret)
1517 goto again;
1518
1519 btrfs_set_path_blocking(p);
1520 sret = split_node(trans, root, p, level);
1521 btrfs_clear_path_blocking(p, NULL);
1522
1523 BUG_ON(sret > 0);
1524 if (sret) {
1525 ret = sret;
1526 goto done;
1527 }
1528 b = p->nodes[level];
1529 } else if (ins_len < 0 && btrfs_header_nritems(b) <
1530 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1531 int sret;
1532
1533 sret = reada_for_balance(root, p, level);
1534 if (sret)
1535 goto again;
1536
1537 btrfs_set_path_blocking(p);
1538 sret = balance_level(trans, root, p, level);
1539 btrfs_clear_path_blocking(p, NULL);
1540
1541 if (sret) {
1542 ret = sret;
1543 goto done;
1544 }
1545 b = p->nodes[level];
1546 if (!b) {
1547 btrfs_release_path(NULL, p);
1548 goto again;
1549 }
1550 BUG_ON(btrfs_header_nritems(b) == 1);
1551 }
1552 return 0;
1553
1554again:
1555 ret = -EAGAIN;
1556done:
1557 return ret;
1558}
1559
1560/*
1450 * look for key in the tree. path is filled in with nodes along the way 1561 * look for key in the tree. path is filled in with nodes along the way
1451 * if key is found, we return zero and you can find the item in the leaf 1562 * if key is found, we return zero and you can find the item in the leaf
1452 * level of the path (level 0) 1563 * level of the path (level 0)
@@ -1464,16 +1575,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      ins_len, int cow)
 {
 	struct extent_buffer *b;
-	struct extent_buffer *tmp;
 	int slot;
 	int ret;
 	int level;
-	int should_reada = p->reada;
 	int lowest_unlock = 1;
-	int blocksize;
 	u8 lowest_level = 0;
-	u64 blocknr;
-	u64 gen;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len > 0);
@@ -1502,7 +1608,11 @@ again:
 		if (cow) {
 			int wret;
 
-			/* is a cow on this block not required */
+			/*
+			 * if we don't really need to cow this block
+			 * then we don't want to set the path blocking,
+			 * so we test it here
+			 */
 			if (btrfs_header_generation(b) == trans->transid &&
 			    btrfs_header_owner(b) == root->root_key.objectid &&
 			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
@@ -1557,51 +1667,15 @@ cow_done:
1557 if (ret && slot > 0) 1667 if (ret && slot > 0)
1558 slot -= 1; 1668 slot -= 1;
1559 p->slots[level] = slot; 1669 p->slots[level] = slot;
1560 if ((p->search_for_split || ins_len > 0) && 1670 ret = setup_nodes_for_search(trans, root, p, b, level,
1561 btrfs_header_nritems(b) >= 1671 ins_len);
1562 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { 1672 if (ret == -EAGAIN)
1563 int sret; 1673 goto again;
1564 1674 else if (ret)
1565 sret = reada_for_balance(root, p, level); 1675 goto done;
1566 if (sret) 1676 b = p->nodes[level];
1567 goto again; 1677 slot = p->slots[level];
1568
1569 btrfs_set_path_blocking(p);
1570 sret = split_node(trans, root, p, level);
1571 btrfs_clear_path_blocking(p, NULL);
1572
1573 BUG_ON(sret > 0);
1574 if (sret) {
1575 ret = sret;
1576 goto done;
1577 }
1578 b = p->nodes[level];
1579 slot = p->slots[level];
1580 } else if (ins_len < 0 &&
1581 btrfs_header_nritems(b) <
1582 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
1583 int sret;
1584
1585 sret = reada_for_balance(root, p, level);
1586 if (sret)
1587 goto again;
1588
1589 btrfs_set_path_blocking(p);
1590 sret = balance_level(trans, root, p, level);
1591 btrfs_clear_path_blocking(p, NULL);
1592 1678
1593 if (sret) {
1594 ret = sret;
1595 goto done;
1596 }
1597 b = p->nodes[level];
1598 if (!b) {
1599 btrfs_release_path(NULL, p);
1600 goto again;
1601 }
1602 slot = p->slots[level];
1603 BUG_ON(btrfs_header_nritems(b) == 1);
1604 }
1605 unlock_up(p, level, lowest_unlock); 1679 unlock_up(p, level, lowest_unlock);
1606 1680
1607 /* this is only true while dropping a snapshot */ 1681 /* this is only true while dropping a snapshot */
@@ -1610,44 +1684,11 @@ cow_done:
1610 goto done; 1684 goto done;
1611 } 1685 }
1612 1686
1613 blocknr = btrfs_node_blockptr(b, slot); 1687 ret = read_block_for_search(trans, root, p,
1614 gen = btrfs_node_ptr_generation(b, slot); 1688 &b, level, slot, key);
1615 blocksize = btrfs_level_size(root, level - 1); 1689 if (ret == -EAGAIN)
1690 goto again;
1616 1691
1617 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1618 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1619 b = tmp;
1620 } else {
1621 /*
1622 * reduce lock contention at high levels
1623 * of the btree by dropping locks before
1624 * we read.
1625 */
1626 if (level > 0) {
1627 btrfs_release_path(NULL, p);
1628 if (tmp)
1629 free_extent_buffer(tmp);
1630 if (should_reada)
1631 reada_for_search(root, p,
1632 level, slot,
1633 key->objectid);
1634
1635 tmp = read_tree_block(root, blocknr,
1636 blocksize, gen);
1637 if (tmp)
1638 free_extent_buffer(tmp);
1639 goto again;
1640 } else {
1641 btrfs_set_path_blocking(p);
1642 if (tmp)
1643 free_extent_buffer(tmp);
1644 if (should_reada)
1645 reada_for_search(root, p,
1646 level, slot,
1647 key->objectid);
1648 b = read_node_slot(root, b, slot);
1649 }
1650 }
1651 if (!p->skip_locking) { 1692 if (!p->skip_locking) {
1652 int lret; 1693 int lret;
1653 1694
@@ -2116,8 +2157,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 	BUG_ON(!path->nodes[level]);
 	lower = path->nodes[level];
 	nritems = btrfs_header_nritems(lower);
-	if (slot > nritems)
-		BUG();
+	BUG_ON(slot > nritems);
 	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
 		BUG();
 	if (slot != nritems) {
@@ -4086,28 +4126,44 @@ next:
4086int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) 4126int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4087{ 4127{
4088 int slot; 4128 int slot;
4089 int level = 1; 4129 int level;
4090 struct extent_buffer *c; 4130 struct extent_buffer *c;
4091 struct extent_buffer *next = NULL; 4131 struct extent_buffer *next;
4092 struct btrfs_key key; 4132 struct btrfs_key key;
4093 u32 nritems; 4133 u32 nritems;
4094 int ret; 4134 int ret;
4135 int old_spinning = path->leave_spinning;
4136 int force_blocking = 0;
4095 4137
4096 nritems = btrfs_header_nritems(path->nodes[0]); 4138 nritems = btrfs_header_nritems(path->nodes[0]);
4097 if (nritems == 0) 4139 if (nritems == 0)
4098 return 1; 4140 return 1;
4099 4141
4100 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); 4142 /*
4143 * we take the blocks in an order that upsets lockdep. Using
4144 * blocking mode is the only way around it.
4145 */
4146#ifdef CONFIG_DEBUG_LOCK_ALLOC
4147 force_blocking = 1;
4148#endif
4101 4149
4150 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4151again:
4152 level = 1;
4153 next = NULL;
4102 btrfs_release_path(root, path); 4154 btrfs_release_path(root, path);
4155
4103 path->keep_locks = 1; 4156 path->keep_locks = 1;
4157
4158 if (!force_blocking)
4159 path->leave_spinning = 1;
4160
4104 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4161 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4105 path->keep_locks = 0; 4162 path->keep_locks = 0;
4106 4163
4107 if (ret < 0) 4164 if (ret < 0)
4108 return ret; 4165 return ret;
4109 4166
4110 btrfs_set_path_blocking(path);
4111 nritems = btrfs_header_nritems(path->nodes[0]); 4167 nritems = btrfs_header_nritems(path->nodes[0]);
4112 /* 4168 /*
4113 * by releasing the path above we dropped all our locks. A balance 4169 * by releasing the path above we dropped all our locks. A balance
@@ -4117,19 +4173,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4117 */ 4173 */
4118 if (nritems > 0 && path->slots[0] < nritems - 1) { 4174 if (nritems > 0 && path->slots[0] < nritems - 1) {
4119 path->slots[0]++; 4175 path->slots[0]++;
4176 ret = 0;
4120 goto done; 4177 goto done;
4121 } 4178 }
4122 4179
4123 while (level < BTRFS_MAX_LEVEL) { 4180 while (level < BTRFS_MAX_LEVEL) {
4124 if (!path->nodes[level]) 4181 if (!path->nodes[level]) {
4125 return 1; 4182 ret = 1;
4183 goto done;
4184 }
4126 4185
4127 slot = path->slots[level] + 1; 4186 slot = path->slots[level] + 1;
4128 c = path->nodes[level]; 4187 c = path->nodes[level];
4129 if (slot >= btrfs_header_nritems(c)) { 4188 if (slot >= btrfs_header_nritems(c)) {
4130 level++; 4189 level++;
4131 if (level == BTRFS_MAX_LEVEL) 4190 if (level == BTRFS_MAX_LEVEL) {
4132 return 1; 4191 ret = 1;
4192 goto done;
4193 }
4133 continue; 4194 continue;
4134 } 4195 }
4135 4196
@@ -4138,16 +4199,22 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4138 free_extent_buffer(next); 4199 free_extent_buffer(next);
4139 } 4200 }
4140 4201
4141 /* the path was set to blocking above */ 4202 next = c;
4142 if (level == 1 && (path->locks[1] || path->skip_locking) && 4203 ret = read_block_for_search(NULL, root, path, &next, level,
4143 path->reada) 4204 slot, &key);
4144 reada_for_search(root, path, level, slot, 0); 4205 if (ret == -EAGAIN)
4206 goto again;
4145 4207
4146 next = read_node_slot(root, c, slot);
4147 if (!path->skip_locking) { 4208 if (!path->skip_locking) {
4148 btrfs_assert_tree_locked(c); 4209 ret = btrfs_try_spin_lock(next);
4149 btrfs_tree_lock(next); 4210 if (!ret) {
4150 btrfs_set_lock_blocking(next); 4211 btrfs_set_path_blocking(path);
4212 btrfs_tree_lock(next);
4213 if (!force_blocking)
4214 btrfs_clear_path_blocking(path, next);
4215 }
4216 if (force_blocking)
4217 btrfs_set_lock_blocking(next);
4151 } 4218 }
4152 break; 4219 break;
4153 } 4220 }
@@ -4157,27 +4224,42 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4157 c = path->nodes[level]; 4224 c = path->nodes[level];
4158 if (path->locks[level]) 4225 if (path->locks[level])
4159 btrfs_tree_unlock(c); 4226 btrfs_tree_unlock(c);
4227
4160 free_extent_buffer(c); 4228 free_extent_buffer(c);
4161 path->nodes[level] = next; 4229 path->nodes[level] = next;
4162 path->slots[level] = 0; 4230 path->slots[level] = 0;
4163 if (!path->skip_locking) 4231 if (!path->skip_locking)
4164 path->locks[level] = 1; 4232 path->locks[level] = 1;
4233
4165 if (!level) 4234 if (!level)
4166 break; 4235 break;
4167 4236
4168 btrfs_set_path_blocking(path); 4237 ret = read_block_for_search(NULL, root, path, &next, level,
4169 if (level == 1 && path->locks[1] && path->reada) 4238 0, &key);
4170 reada_for_search(root, path, level, slot, 0); 4239 if (ret == -EAGAIN)
4171 next = read_node_slot(root, next, 0); 4240 goto again;
4241
4172 if (!path->skip_locking) { 4242 if (!path->skip_locking) {
4173 btrfs_assert_tree_locked(path->nodes[level]); 4243 btrfs_assert_tree_locked(path->nodes[level]);
4174 btrfs_tree_lock(next); 4244 ret = btrfs_try_spin_lock(next);
4175 btrfs_set_lock_blocking(next); 4245 if (!ret) {
4246 btrfs_set_path_blocking(path);
4247 btrfs_tree_lock(next);
4248 if (!force_blocking)
4249 btrfs_clear_path_blocking(path, next);
4250 }
4251 if (force_blocking)
4252 btrfs_set_lock_blocking(next);
4176 } 4253 }
4177 } 4254 }
4255 ret = 0;
4178done: 4256done:
4179 unlock_up(path, 0, 1); 4257 unlock_up(path, 0, 1);
4180 return 0; 4258 path->leave_spinning = old_spinning;
4259 if (!old_spinning)
4260 btrfs_set_path_blocking(path);
4261
4262 return ret;
4181} 4263}
4182 4264
4183/* 4265/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9417713542a2..ad96495dedc5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -143,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 #define BTRFS_FT_MAX		9
 
 /*
- * the key defines the order in the tree, and so it also defines (optimal)
- * block layout.  objectid corresonds to the inode number.  The flags
- * tells us things about the object, and is a kind of stream selector.
- * so for a given inode, keys with flags of 1 might refer to the inode
- * data, flags of 2 may point to file data in the btree and flags == 3
- * may point to extents.
+ * The key defines the order in the tree, and so it also defines (optimal)
+ * block layout.
+ *
+ * objectid corresponds to the inode number.
+ *
+ * type tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with type of 1 might refer to the inode data,
+ * type of 2 may point to file data in the btree and type == 3 may point to
+ * extents.
  *
  * offset is the starting byte offset for this key in the stream.
  *
@@ -200,7 +203,7 @@ struct btrfs_dev_item {
 
 	/*
 	 * starting byte of this partition on the device,
-	 * to allowr for stripe alignment in the future
+	 * to allow for stripe alignment in the future
 	 */
 	__le64 start_offset;
 
@@ -633,18 +636,35 @@ struct btrfs_space_info {
 	struct rw_semaphore groups_sem;
 };
 
-struct btrfs_free_space {
-	struct rb_node bytes_index;
-	struct rb_node offset_index;
-	u64 offset;
-	u64 bytes;
+/*
+ * free clusters are used to claim free space in relatively large chunks,
+ * allowing us to do less seeky writes.  They are used for all metadata
+ * allocations and data allocations in ssd mode.
+ */
+struct btrfs_free_cluster {
+	spinlock_t lock;
+	spinlock_t refill_lock;
+	struct rb_root root;
+
+	/* largest extent in this cluster */
+	u64 max_size;
+
+	/* first extent starting offset */
+	u64 window_start;
+
+	struct btrfs_block_group_cache *block_group;
+	/*
+	 * when a cluster is allocated from a block group, we put the
+	 * cluster onto a list in the block group so that it can
+	 * be freed before the block group is freed.
+	 */
+	struct list_head block_group_list;
 };
 
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
 	spinlock_t lock;
-	struct mutex alloc_mutex;
 	struct mutex cache_mutex;
 	u64 pinned;
 	u64 reserved;
@@ -656,6 +676,7 @@ struct btrfs_block_group_cache {
 	struct btrfs_space_info *space_info;
 
 	/* free space cache stuff */
+	spinlock_t tree_lock;
 	struct rb_root free_space_bytes;
 	struct rb_root free_space_offset;
 
@@ -667,6 +688,11 @@ struct btrfs_block_group_cache {
 
 	/* usage count */
 	atomic_t count;
+
+	/* List of struct btrfs_free_clusters for this block group.
+	 * Today it will only have one thing on it, but that may change
+	 */
+	struct list_head cluster_list;
 };
 
 struct btrfs_leaf_ref_tree {
@@ -728,7 +754,6 @@ struct btrfs_fs_info {
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
 	struct mutex cleaner_mutex;
-	struct mutex pinned_mutex;
 	struct mutex chunk_mutex;
 	struct mutex drop_mutex;
 	struct mutex volume_mutex;
@@ -839,8 +864,12 @@ struct btrfs_fs_info {
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
-	u64 last_alloc;
-	u64 last_data_alloc;
+
+	/* data_alloc_cluster is only used in ssd mode */
+	struct btrfs_free_cluster data_alloc_cluster;
+
+	/* all metadata allocations go through this cluster */
+	struct btrfs_free_cluster meta_alloc_cluster;
 
 	spinlock_t ref_cache_lock;
 	u64 total_ref_cache_size;
@@ -932,7 +961,6 @@ struct btrfs_root {
 };
 
 /*
-
  * inode items have the data typically returned from stat and store other
  * info about object characteristics.  There is one for every file and dir in
  * the FS
@@ -963,7 +991,7 @@ struct btrfs_root {
 #define BTRFS_EXTENT_CSUM_KEY	128
 
 /*
- * root items point to tree roots.  There are typically in the root
+ * root items point to tree roots.  They are typically in the root
  * tree used by the super block to find all the other trees
  */
 #define BTRFS_ROOT_ITEM_KEY	132
@@ -1010,6 +1038,8 @@ struct btrfs_root {
 #define BTRFS_MOUNT_SSD			(1 << 3)
 #define BTRFS_MOUNT_DEGRADED		(1 << 4)
 #define BTRFS_MOUNT_COMPRESS		(1 << 5)
+#define BTRFS_MOUNT_NOTREELOG		(1 << 6)
+#define BTRFS_MOUNT_FLUSHONCOMMIT	(1 << 7)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -1748,6 +1778,7 @@ static inline struct dentry *fdentry(struct file *file)
 }
 
 /* extent-tree.c */
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
@@ -2174,21 +2205,4 @@ int btrfs_check_acl(struct inode *inode, int mask);
 int btrfs_init_acl(struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
 
-/* free-space-cache.c */
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-			 u64 bytenr, u64 size);
-int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
-			      u64 offset, u64 bytes);
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
-			    u64 bytenr, u64 size);
-int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
-				 u64 offset, u64 bytes);
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
-				   *block_group);
-struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
-					       *block_group, u64 offset,
-					       u64 bytes);
-void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
-			   u64 bytes);
-u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
 #endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index cbf7dc8ae3ec..d6c01c096a40 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -18,7 +18,6 @@
 
 #include <linux/sched.h>
 #include <linux/sort.h>
-#include <linux/ftrace.h>
 #include "ctree.h"
 #include "delayed-ref.h"
 #include "transaction.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92d73929d381..92caa8035f36 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -38,6 +38,7 @@
 #include "locking.h"
 #include "ref-cache.h"
 #include "tree-log.h"
+#include "free-space-cache.h"
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
@@ -1412,8 +1413,6 @@ static int bio_ready_for_csum(struct bio *bio)
 
 	ret = extent_range_uptodate(io_tree, start + length,
 				    start + buf_len - 1);
-	if (ret == 1)
-		return ret;
 	return ret;
 }
 
@@ -1647,12 +1646,15 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->ordered_operations_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->drop_mutex);
-	mutex_init(&fs_info->pinned_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
 	mutex_init(&fs_info->tree_reloc_mutex);
+
+	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
+	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
+
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5e7cae63d80..178df4c67de4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
 #include "volumes.h"
 #include "locking.h"
 #include "ref-cache.h"
+#include "free-space-cache.h"
 
 #define PENDING_EXTENT_INSERT 0
 #define PENDING_EXTENT_DELETE 1
@@ -166,7 +167,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 	u64 extent_start, extent_end, size;
 	int ret;
 
-	mutex_lock(&info->pinned_mutex);
 	while (start < end) {
 		ret = find_first_extent_bit(&info->pinned_extents, start,
 					    &extent_start, &extent_end,
@@ -192,7 +192,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 		ret = btrfs_add_free_space(block_group, start, size);
 		BUG_ON(ret);
 	}
-	mutex_unlock(&info->pinned_mutex);
 
 	return 0;
 }
@@ -291,8 +290,8 @@ next:
 					    block_group->key.objectid +
 					    block_group->key.offset);
 
-	remove_sb_from_cache(root, block_group);
 	block_group->cached = 1;
+	remove_sb_from_cache(root, block_group);
 	ret = 0;
 err:
 	btrfs_free_path(path);
@@ -326,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
 	return cache;
 }
 
-static inline void put_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
 	if (atomic_dec_and_test(&cache->count))
 		kfree(cache);
@@ -399,12 +398,12 @@ again:
 		    div_factor(cache->key.offset, factor)) {
 			group_start = cache->key.objectid;
 			spin_unlock(&cache->lock);
-			put_block_group(cache);
+			btrfs_put_block_group(cache);
 			goto found;
 		}
 		spin_unlock(&cache->lock);
-		put_block_group(cache);
+		btrfs_put_block_group(cache);
 		cond_resched();
 	}
 	if (!wrapped) {
@@ -1594,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
 	if (!block_group || block_group->ro)
 		readonly = 1;
 	if (block_group)
-		put_block_group(block_group);
+		btrfs_put_block_group(block_group);
 	return readonly;
 }
 
@@ -2018,7 +2017,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 				WARN_ON(ret);
 			}
 		}
-		put_block_group(cache);
+		btrfs_put_block_group(cache);
 		total -= num_bytes;
 		bytenr += num_bytes;
 	}
@@ -2035,7 +2034,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 		return 0;
 
 	bytenr = cache->key.objectid;
-	put_block_group(cache);
+	btrfs_put_block_group(cache);
 
 	return bytenr;
 }
@@ -2047,7 +2046,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
 	if (pin) {
 		set_extent_dirty(&fs_info->pinned_extents,
 				 bytenr, bytenr + num - 1, GFP_NOFS);
@@ -2055,7 +2053,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 		clear_extent_dirty(&fs_info->pinned_extents,
 				   bytenr, bytenr + num - 1, GFP_NOFS);
 	}
-	mutex_unlock(&root->fs_info->pinned_mutex);
 
 	while (num > 0) {
 		cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2081,7 +2078,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 			if (cache->cached)
 				btrfs_add_free_space(cache, bytenr, len);
 		}
-		put_block_group(cache);
+		btrfs_put_block_group(cache);
 		bytenr += len;
 		num -= len;
 	}
@@ -2112,7 +2109,7 @@ static int update_reserved_extents(struct btrfs_root *root,
 		}
 		spin_unlock(&cache->lock);
 		spin_unlock(&cache->space_info->lock);
-		put_block_group(cache);
+		btrfs_put_block_group(cache);
 		bytenr += len;
 		num -= len;
 	}
@@ -2127,7 +2124,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
 	int ret;
 
-	mutex_lock(&root->fs_info->pinned_mutex);
 	while (1) {
 		ret = find_first_extent_bit(pinned_extents, last,
 					    &start, &end, EXTENT_DIRTY);
@@ -2136,7 +2132,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 		set_extent_dirty(copy, start, end, GFP_NOFS);
 		last = end + 1;
 	}
-	mutex_unlock(&root->fs_info->pinned_mutex);
 	return 0;
 }
 
@@ -2149,7 +2144,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	int ret;
 
 	while (1) {
-		mutex_lock(&root->fs_info->pinned_mutex);
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY);
 		if (ret)
@@ -2163,7 +2157,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
 		cond_resched();
 	}
-	mutex_unlock(&root->fs_info->pinned_mutex);
 	return ret;
 }
 
@@ -2205,7 +2198,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 	free_extent_buffer(buf);
 pinit:
 	btrfs_set_path_blocking(path);
-	mutex_lock(&root->fs_info->pinned_mutex);
 	/* unlocks the pinned mutex */
 	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 
@@ -2511,8 +2503,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 	 */
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
-		mutex_lock(&root->fs_info->pinned_mutex);
-
 		/* unlocks the pinned mutex */
 		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 		update_reserved_extents(root, bytenr, num_bytes, 0);
@@ -2554,228 +2544,237 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2554{ 2544{
2555 int ret = 0; 2545 int ret = 0;
2556 struct btrfs_root *root = orig_root->fs_info->extent_root; 2546 struct btrfs_root *root = orig_root->fs_info->extent_root;
2557 u64 total_needed = num_bytes; 2547 struct btrfs_free_cluster *last_ptr = NULL;
2558 u64 *last_ptr = NULL;
2559 u64 last_wanted = 0;
2560 struct btrfs_block_group_cache *block_group = NULL; 2548 struct btrfs_block_group_cache *block_group = NULL;
2561 int chunk_alloc_done = 0;
2562 int empty_cluster = 2 * 1024 * 1024; 2549 int empty_cluster = 2 * 1024 * 1024;
2563 int allowed_chunk_alloc = 0; 2550 int allowed_chunk_alloc = 0;
2564 struct list_head *head = NULL, *cur = NULL;
2565 int loop = 0;
2566 int extra_loop = 0;
2567 struct btrfs_space_info *space_info; 2551 struct btrfs_space_info *space_info;
2552 int last_ptr_loop = 0;
2553 int loop = 0;
2568 2554
2569 WARN_ON(num_bytes < root->sectorsize); 2555 WARN_ON(num_bytes < root->sectorsize);
2570 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 2556 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2571 ins->objectid = 0; 2557 ins->objectid = 0;
2572 ins->offset = 0; 2558 ins->offset = 0;
2573 2559
2560 space_info = __find_space_info(root->fs_info, data);
2561
2574 if (orig_root->ref_cows || empty_size) 2562 if (orig_root->ref_cows || empty_size)
2575 allowed_chunk_alloc = 1; 2563 allowed_chunk_alloc = 1;
2576 2564
2577 if (data & BTRFS_BLOCK_GROUP_METADATA) { 2565 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2578 last_ptr = &root->fs_info->last_alloc; 2566 last_ptr = &root->fs_info->meta_alloc_cluster;
2579 if (!btrfs_test_opt(root, SSD)) 2567 if (!btrfs_test_opt(root, SSD))
2580 empty_cluster = 64 * 1024; 2568 empty_cluster = 64 * 1024;
2581 } 2569 }
2582 2570
2583 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 2571 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
2584 last_ptr = &root->fs_info->last_data_alloc; 2572 last_ptr = &root->fs_info->data_alloc_cluster;
2573 }
2585 2574
2586 if (last_ptr) { 2575 if (last_ptr) {
2587 if (*last_ptr) { 2576 spin_lock(&last_ptr->lock);
2588 hint_byte = *last_ptr; 2577 if (last_ptr->block_group)
2589 last_wanted = *last_ptr; 2578 hint_byte = last_ptr->window_start;
2590 } else 2579 spin_unlock(&last_ptr->lock);
2591 empty_size += empty_cluster;
2592 } else {
2593 empty_cluster = 0;
2594 } 2580 }
2581
2595 search_start = max(search_start, first_logical_byte(root, 0)); 2582 search_start = max(search_start, first_logical_byte(root, 0));
2596 search_start = max(search_start, hint_byte); 2583 search_start = max(search_start, hint_byte);
2597 2584
2598 if (last_wanted && search_start != last_wanted) { 2585 if (!last_ptr) {
2599 last_wanted = 0; 2586 empty_cluster = 0;
2600 empty_size += empty_cluster; 2587 loop = 1;
2601 } 2588 }
2602 2589
2603 total_needed += empty_size; 2590 if (search_start == hint_byte) {
2604 block_group = btrfs_lookup_block_group(root->fs_info, search_start); 2591 block_group = btrfs_lookup_block_group(root->fs_info,
2605 if (!block_group) 2592 search_start);
2606 block_group = btrfs_lookup_first_block_group(root->fs_info, 2593 if (block_group && block_group_bits(block_group, data)) {
2607 search_start); 2594 down_read(&space_info->groups_sem);
2608 space_info = __find_space_info(root->fs_info, data); 2595 goto have_block_group;
2596 } else if (block_group) {
2597 btrfs_put_block_group(block_group);
2598 }
2599 }
2609 2600
2601search:
2610 down_read(&space_info->groups_sem); 2602 down_read(&space_info->groups_sem);
2611 while (1) { 2603 list_for_each_entry(block_group, &space_info->block_groups, list) {
2612 struct btrfs_free_space *free_space; 2604 u64 offset;
2613 /*
2614 * the only way this happens if our hint points to a block
2615 * group thats not of the proper type, while looping this
2616 * should never happen
2617 */
2618 if (empty_size)
2619 extra_loop = 1;
2620 2605
2621 if (!block_group) 2606 atomic_inc(&block_group->count);
2622 goto new_group_no_lock; 2607 search_start = block_group->key.objectid;
2623 2608
2609have_block_group:
2624 if (unlikely(!block_group->cached)) { 2610 if (unlikely(!block_group->cached)) {
2625 mutex_lock(&block_group->cache_mutex); 2611 mutex_lock(&block_group->cache_mutex);
2626 ret = cache_block_group(root, block_group); 2612 ret = cache_block_group(root, block_group);
2627 mutex_unlock(&block_group->cache_mutex); 2613 mutex_unlock(&block_group->cache_mutex);
2628 if (ret) 2614 if (ret) {
2615 btrfs_put_block_group(block_group);
2629 break; 2616 break;
2617 }
2630 } 2618 }
2631 2619
2632 mutex_lock(&block_group->alloc_mutex);
2633 if (unlikely(!block_group_bits(block_group, data)))
2634 goto new_group;
2635
2636 if (unlikely(block_group->ro)) 2620 if (unlikely(block_group->ro))
2637 goto new_group; 2621 goto loop;
2638 2622
2639 free_space = btrfs_find_free_space(block_group, search_start, 2623 if (last_ptr) {
2640 total_needed); 2624 /*
2641 if (free_space) { 2625 * the refill lock keeps out other
2642 u64 start = block_group->key.objectid; 2626 * people trying to start a new cluster
2643 u64 end = block_group->key.objectid + 2627 */
2644 block_group->key.offset; 2628 spin_lock(&last_ptr->refill_lock);
2629 offset = btrfs_alloc_from_cluster(block_group, last_ptr,
2630 num_bytes, search_start);
2631 if (offset) {
2632 /* we have a block, we're done */
2633 spin_unlock(&last_ptr->refill_lock);
2634 goto checks;
2635 }
2645 2636
2646 search_start = stripe_align(root, free_space->offset); 2637 spin_lock(&last_ptr->lock);
2638 /*
2639 * whoops, this cluster doesn't actually point to
2640 * this block group. Get a ref on the block
2641 * group is does point to and try again
2642 */
2643 if (!last_ptr_loop && last_ptr->block_group &&
2644 last_ptr->block_group != block_group) {
2645
2646 btrfs_put_block_group(block_group);
2647 block_group = last_ptr->block_group;
2648 atomic_inc(&block_group->count);
2649 spin_unlock(&last_ptr->lock);
2650 spin_unlock(&last_ptr->refill_lock);
2651
2652 last_ptr_loop = 1;
2653 search_start = block_group->key.objectid;
2654 goto have_block_group;
2655 }
2656 spin_unlock(&last_ptr->lock);
2647 2657
2648 /* move on to the next group */ 2658 /*
2649 if (search_start + num_bytes >= search_end) 2659 * this cluster didn't work out, free it and
2650 goto new_group; 2660 * start over
2661 */
2662 btrfs_return_cluster_to_free_space(NULL, last_ptr);
2651 2663
2652 /* move on to the next group */ 2664 last_ptr_loop = 0;
2653 if (search_start + num_bytes > end)
2654 goto new_group;
2655 2665
2656 if (last_wanted && search_start != last_wanted) { 2666 /* allocate a cluster in this block group */
2657 total_needed += empty_cluster; 2667 ret = btrfs_find_space_cluster(trans,
2658 empty_size += empty_cluster; 2668 block_group, last_ptr,
2659 last_wanted = 0; 2669 offset, num_bytes,
2670 empty_cluster + empty_size);
2671 if (ret == 0) {
2660 /* 2672 /*
2661 * if search_start is still in this block group 2673 * now pull our allocation out of this
2662 * then we just re-search this block group 2674 * cluster
2663 */ 2675 */
2664 if (search_start >= start && 2676 offset = btrfs_alloc_from_cluster(block_group,
2665 search_start < end) { 2677 last_ptr, num_bytes,
2666 mutex_unlock(&block_group->alloc_mutex); 2678 search_start);
2667 continue; 2679 if (offset) {
2680 /* we found one, proceed */
2681 spin_unlock(&last_ptr->refill_lock);
2682 goto checks;
2668 } 2683 }
2669
2670 /* else we go to the next block group */
2671 goto new_group;
2672 } 2684 }
2673 2685 /*
2674 if (exclude_nr > 0 && 2686 * at this point we either didn't find a cluster
2675 (search_start + num_bytes > exclude_start && 2687 * or we weren't able to allocate a block from our
2676 search_start < exclude_start + exclude_nr)) { 2688 * cluster. Free the cluster we've been trying
2677 search_start = exclude_start + exclude_nr; 2689 * to use, and go to the next block group
2678 /* 2690 */
2679 * if search_start is still in this block group 2691 if (loop < 2) {
2680 * then we just re-search this block group 2692 btrfs_return_cluster_to_free_space(NULL,
2681 */ 2693 last_ptr);
2682 if (search_start >= start && 2694 spin_unlock(&last_ptr->refill_lock);
2683 search_start < end) { 2695 goto loop;
2684 mutex_unlock(&block_group->alloc_mutex);
2685 last_wanted = 0;
2686 continue;
2687 }
2688
2689 /* else we go to the next block group */
2690 goto new_group;
2691 } 2696 }
2697 spin_unlock(&last_ptr->refill_lock);
2698 }
2692 2699
2693 ins->objectid = search_start; 2700 offset = btrfs_find_space_for_alloc(block_group, search_start,
2694 ins->offset = num_bytes; 2701 num_bytes, empty_size);
2702 if (!offset)
2703 goto loop;
2704checks:
2705 search_start = stripe_align(root, offset);
2706
2707 /* move on to the next group */
2708 if (search_start + num_bytes >= search_end) {
2709 btrfs_add_free_space(block_group, offset, num_bytes);
2710 goto loop;
2711 }
2695 2712
2696 btrfs_remove_free_space_lock(block_group, search_start, 2713 /* move on to the next group */
2697 num_bytes); 2714 if (search_start + num_bytes >
2698 /* we are all good, lets return */ 2715 block_group->key.objectid + block_group->key.offset) {
2699 mutex_unlock(&block_group->alloc_mutex); 2716 btrfs_add_free_space(block_group, offset, num_bytes);
2700 break; 2717 goto loop;
2701 } 2718 }
2702new_group:
2703 mutex_unlock(&block_group->alloc_mutex);
2704 put_block_group(block_group);
2705 block_group = NULL;
2706new_group_no_lock:
2707 /* don't try to compare new allocations against the
2708 * last allocation any more
2709 */
2710 last_wanted = 0;
2711 2719
2712 /* 2720 if (exclude_nr > 0 &&
2713 * Here's how this works. 2721 (search_start + num_bytes > exclude_start &&
2714 * loop == 0: we were searching a block group via a hint 2722 search_start < exclude_start + exclude_nr)) {
2715 * and didn't find anything, so we start at 2723 search_start = exclude_start + exclude_nr;
2716 * the head of the block groups and keep searching 2724
2717 * loop == 1: we're searching through all of the block groups 2725 btrfs_add_free_space(block_group, offset, num_bytes);
2718 * if we hit the head again we have searched 2726 /*
2719 * all of the block groups for this space and we 2727 * if search_start is still in this block group
2720 * need to try and allocate, if we cant error out. 2728 * then we just re-search this block group
2721 * loop == 2: we allocated more space and are looping through
2722 * all of the block groups again.
2723 */
2724 if (loop == 0) {
2725 head = &space_info->block_groups;
2726 cur = head->next;
2727 loop++;
2728 } else if (loop == 1 && cur == head) {
2729 int keep_going;
2730
2731 /* at this point we give up on the empty_size
2732 * allocations and just try to allocate the min
2733 * space.
2734 *
2735 * The extra_loop field was set if an empty_size
2736 * allocation was attempted above, and if this
2737 * is try we need to try the loop again without
2738 * the additional empty_size.
2739 */ 2729 */
2740 total_needed -= empty_size; 2730 if (search_start >= block_group->key.objectid &&
2741 empty_size = 0; 2731 search_start < (block_group->key.objectid +
2742 keep_going = extra_loop; 2732 block_group->key.offset))
2743 loop++; 2733 goto have_block_group;
2734 goto loop;
2735 }
2744 2736
2745 if (allowed_chunk_alloc && !chunk_alloc_done) { 2737 ins->objectid = search_start;
2746 up_read(&space_info->groups_sem); 2738 ins->offset = num_bytes;
2747 ret = do_chunk_alloc(trans, root, num_bytes + 2739
2748 2 * 1024 * 1024, data, 1); 2740 if (offset < search_start)
2749 down_read(&space_info->groups_sem); 2741 btrfs_add_free_space(block_group, offset,
2750 if (ret < 0) 2742 search_start - offset);
2751 goto loop_check; 2743 BUG_ON(offset > search_start);
2752 head = &space_info->block_groups; 2744
2753 /* 2745 /* we are all good, lets return */
2754 * we've allocated a new chunk, keep 2746 break;
2755 * trying 2747loop:
2756 */ 2748 btrfs_put_block_group(block_group);
2757 keep_going = 1; 2749 }
2758 chunk_alloc_done = 1; 2750 up_read(&space_info->groups_sem);
2759 } else if (!allowed_chunk_alloc) { 2751
2760 space_info->force_alloc = 1; 2752 /* loop == 0, try to find a clustered alloc in every block group
2761 } 2753 * loop == 1, try again after forcing a chunk allocation
2762loop_check: 2754 * loop == 2, set empty_size and empty_cluster to 0 and try again
2763 if (keep_going) { 2755 */
2764 cur = head->next; 2756 if (!ins->objectid && loop < 3 &&
2765 extra_loop = 0; 2757 (empty_size || empty_cluster || allowed_chunk_alloc)) {
2766 } else { 2758 if (loop >= 2) {
2767 break; 2759 empty_size = 0;
2768 } 2760 empty_cluster = 0;
2769 } else if (cur == head) {
2770 break;
2771 } 2761 }
2772 2762
2773 block_group = list_entry(cur, struct btrfs_block_group_cache, 2763 if (allowed_chunk_alloc) {
2774 list); 2764 ret = do_chunk_alloc(trans, root, num_bytes +
2775 atomic_inc(&block_group->count); 2765 2 * 1024 * 1024, data, 1);
2766 allowed_chunk_alloc = 0;
2767 } else {
2768 space_info->force_alloc = 1;
2769 }
2776 2770
2777 search_start = block_group->key.objectid; 2771 if (loop < 3) {
2778 cur = cur->next; 2772 loop++;
2773 goto search;
2774 }
2775 ret = -ENOSPC;
2776 } else if (!ins->objectid) {
2777 ret = -ENOSPC;
2779 } 2778 }
2780 2779
2781 /* we found what we needed */ 2780 /* we found what we needed */
@@ -2783,21 +2782,10 @@ loop_check:
 		if (!(data & BTRFS_BLOCK_GROUP_DATA))
 			trans->block_group = block_group->key.objectid;
 
-		if (last_ptr)
-			*last_ptr = ins->objectid + ins->offset;
+		btrfs_put_block_group(block_group);
 		ret = 0;
-	} else if (!ret) {
-		printk(KERN_ERR "btrfs searching for %llu bytes, "
-		       "num_bytes %llu, loop %d, allowed_alloc %d\n",
-		       (unsigned long long)total_needed,
-		       (unsigned long long)num_bytes,
-		       loop, allowed_chunk_alloc);
-		ret = -ENOSPC;
 	}
-	if (block_group)
-		put_block_group(block_group);
 
-	up_read(&space_info->groups_sem);
 	return ret;
 }
@@ -2902,7 +2890,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 	ret = btrfs_discard_extent(root, start, len);
 
 	btrfs_add_free_space(cache, start, len);
-	put_block_group(cache);
+	btrfs_put_block_group(cache);
 	update_reserved_extents(root, start, len, 0);
 
 	return ret;
@@ -3040,7 +3028,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 	ret = btrfs_remove_free_space(block_group, ins->objectid,
 				      ins->offset);
 	BUG_ON(ret);
-	put_block_group(block_group);
+	btrfs_put_block_group(block_group);
 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
 					    ref_generation, owner, ins, 1);
 	return ret;
@@ -5729,7 +5717,7 @@ next:
 	WARN_ON(block_group->reserved > 0);
 	WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
 	spin_unlock(&block_group->lock);
-	put_block_group(block_group);
+	btrfs_put_block_group(block_group);
 	ret = 0;
 out:
 	btrfs_free_path(path);
@@ -5856,9 +5844,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
 		atomic_set(&cache->count, 1);
 		spin_lock_init(&cache->lock);
-		mutex_init(&cache->alloc_mutex);
+		spin_lock_init(&cache->tree_lock);
 		mutex_init(&cache->cache_mutex);
 		INIT_LIST_HEAD(&cache->list);
+		INIT_LIST_HEAD(&cache->cluster_list);
 		read_extent_buffer(leaf, &cache->item,
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
 				   sizeof(cache->item));
@@ -5912,9 +5901,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
-	mutex_init(&cache->alloc_mutex);
+	spin_lock_init(&cache->tree_lock);
 	mutex_init(&cache->cache_mutex);
 	INIT_LIST_HEAD(&cache->list);
+	INIT_LIST_HEAD(&cache->cluster_list);
 
 	btrfs_set_block_group_used(&cache->item, bytes_used);
 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -5974,8 +5964,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	spin_unlock(&block_group->space_info->lock);
 	block_group->space_info->full = 0;
 
-	put_block_group(block_group);
-	put_block_group(block_group);
+	btrfs_put_block_group(block_group);
+	btrfs_put_block_group(block_group);
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 08085af089e2..eb2bee8b7fbf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2884,25 +2884,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		disko = 0;
 		flags = 0;
 
-		switch (em->block_start) {
-		case EXTENT_MAP_LAST_BYTE:
+		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
 			end = 1;
 			flags |= FIEMAP_EXTENT_LAST;
-			break;
-		case EXTENT_MAP_HOLE:
+		} else if (em->block_start == EXTENT_MAP_HOLE) {
 			flags |= FIEMAP_EXTENT_UNWRITTEN;
-			break;
-		case EXTENT_MAP_INLINE:
+		} else if (em->block_start == EXTENT_MAP_INLINE) {
 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
 				  FIEMAP_EXTENT_NOT_ALIGNED);
-			break;
-		case EXTENT_MAP_DELALLOC:
+		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
 			flags |= (FIEMAP_EXTENT_DELALLOC |
 				  FIEMAP_EXTENT_UNKNOWN);
-			break;
-		default:
+		} else {
 			disko = em->block_start;
-			break;
 		}
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
 			flags |= FIEMAP_EXTENT_ENCODED;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 50da69da20ce..b187917b36fa 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -234,7 +234,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
234 rb = tree_insert(&tree->map, em->start, &em->rb_node); 234 rb = tree_insert(&tree->map, em->start, &em->rb_node);
235 if (rb) { 235 if (rb) {
236 ret = -EEXIST; 236 ret = -EEXIST;
237 free_extent_map(merge);
238 goto out; 237 goto out;
239 } 238 }
240 atomic_inc(&em->refs); 239 atomic_inc(&em->refs);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d1e5f0e84c58..768b9523662d 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,15 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include "ctree.h" 20#include "ctree.h"
21#include "free-space-cache.h"
22#include "transaction.h"
23
24struct btrfs_free_space {
25 struct rb_node bytes_index;
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
21 30
22static int tree_insert_offset(struct rb_root *root, u64 offset, 31static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node) 32 struct rb_node *node)
@@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
68} 77}
69 78
70/* 79/*
71 * searches the tree for the given offset. If contains is set we will return 80 * searches the tree for the given offset.
72 * the free space that contains the given offset. If contains is not set we 81 *
73 * will return the free space that starts at or after the given offset and is 82 * fuzzy == 1: this is used for allocations where we are given a hint of where
74 * at least bytes long. 83 * to look for free space. Because the hint may not be completely on an offset
84 * mark, or the hint may no longer point to free space we need to fudge our
85 * results a bit. So we look for free space starting at or after offset with at
86 * least bytes size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
 92 * starts at the given offset and is at least bytes in size, and if it's not there
93 * return NULL.
75 */ 94 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 95static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes, 96 u64 offset, u64 bytes,
78 int contains) 97 int fuzzy)
79{ 98{
80 struct rb_node *n = root->rb_node; 99 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL; 100 struct btrfs_free_space *entry, *ret = NULL;
@@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
84 entry = rb_entry(n, struct btrfs_free_space, offset_index); 103 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85 104
86 if (offset < entry->offset) { 105 if (offset < entry->offset) {
87 if (!contains && 106 if (fuzzy &&
88 (!ret || entry->offset < ret->offset) && 107 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes)) 108 (bytes <= entry->bytes))
90 ret = entry; 109 ret = entry;
91 n = n->rb_left; 110 n = n->rb_left;
92 } else if (offset > entry->offset) { 111 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset && 112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) { 114 bytes <= entry->bytes) {
95 ret = entry; 115 ret = entry;
96 break; 116 break;
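The fuzzy/exact semantics described in the new comment can be modelled outside the kernel. The sketch below walks a plain array sorted by offset instead of the free_space_offset rbtree, and the struct and helper names are illustrative rather than btrfs code; it only mirrors the lookup rules: an exact search wants an entry starting precisely at the given offset with at least the requested bytes, while a fuzzy search also accepts the entry containing the offset, or the closest large-enough entry after it.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Simplified stand-in for struct btrfs_free_space (illustrative only). */
struct free_extent {
	uint64_t offset;
	uint64_t bytes;
};

static struct free_extent *search(struct free_extent *e, size_t n,
				  uint64_t offset, uint64_t bytes, int fuzzy)
{
	struct free_extent *ret = NULL;
	size_t i;

	for (i = 0; i < n; i++) {
		if (e[i].offset == offset) {
			/* exact hit: good for both modes if large enough */
			if (e[i].bytes >= bytes)
				return &e[i];
		} else if (offset < e[i].offset) {
			/* fuzzy: remember the closest entry after offset */
			if (fuzzy && e[i].bytes >= bytes &&
			    (!ret || e[i].offset < ret->offset))
				ret = &e[i];
		} else {
			/* fuzzy: an entry that contains offset wins outright */
			if (fuzzy && e[i].offset + e[i].bytes - 1 >= offset &&
			    e[i].bytes >= bytes)
				return &e[i];
		}
	}
	return ret;
}

int main(void)
{
	struct free_extent ext[] = { {0, 64}, {128, 32}, {512, 256} };

	/* exact lookup fails: nothing starts at offset 130 */
	printf("exact: %s\n", search(ext, 3, 130, 16, 0) ? "found" : "none");
	/* fuzzy lookup returns the extent containing offset 130 */
	printf("fuzzy: offset=%llu\n",
	       (unsigned long long)search(ext, 3, 130, 16, 1)->offset);
	return 0;
}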
@@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
171 int ret = 0; 191 int ret = 0;
172 192
173 193
194 BUG_ON(!info->bytes);
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index); 196 &info->offset_index);
176 if (ret) 197 if (ret)
@@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
184 return ret; 205 return ret;
185} 206}
186 207
187static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes) 209 u64 offset, u64 bytes)
189{ 210{
190 struct btrfs_free_space *right_info; 211 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info; 212 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL; 213 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0; 214 int ret = 0;
195 215
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 216 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info) 217 if (!info)
198 return -ENOMEM; 218 return -ENOMEM;
199 219
220 info->offset = offset;
221 info->bytes = bytes;
222
223 spin_lock(&block_group->tree_lock);
224
200 /* 225 /*
201 * first we want to see if there is free space adjacent to the range we 226 * first we want to see if there is free space adjacent to the range we
202 * are adding, if there is remove that struct and add a new one to 227 * are adding, if there is remove that struct and add a new one to
203 * cover the entire range 228 * cover the entire range
204 */ 229 */
205 right_info = tree_search_offset(&block_group->free_space_offset, 230 right_info = tree_search_offset(&block_group->free_space_offset,
206 offset+bytes, 0, 1); 231 offset+bytes, 0, 0);
207 left_info = tree_search_offset(&block_group->free_space_offset, 232 left_info = tree_search_offset(&block_group->free_space_offset,
208 offset-1, 0, 1); 233 offset-1, 0, 1);
209 234
210 if (right_info && right_info->offset == offset+bytes) { 235 if (right_info) {
211 unlink_free_space(block_group, right_info); 236 unlink_free_space(block_group, right_info);
212 info = right_info; 237 info->bytes += right_info->bytes;
213 info->offset = offset; 238 kfree(right_info);
214 info->bytes += bytes;
215 } else if (right_info && right_info->offset != offset+bytes) {
216 printk(KERN_ERR "btrfs adding space in the middle of an "
217 "existing free space area. existing: "
218 "offset=%llu, bytes=%llu. new: offset=%llu, "
219 "bytes=%llu\n", (unsigned long long)right_info->offset,
220 (unsigned long long)right_info->bytes,
221 (unsigned long long)offset,
222 (unsigned long long)bytes);
223 BUG();
224 } 239 }
225 240
226 if (left_info) { 241 if (left_info && left_info->offset + left_info->bytes == offset) {
227 unlink_free_space(block_group, left_info); 242 unlink_free_space(block_group, left_info);
228 243 info->offset = left_info->offset;
229 if (unlikely((left_info->offset + left_info->bytes) != 244 info->bytes += left_info->bytes;
230 offset)) { 245 kfree(left_info);
231 printk(KERN_ERR "btrfs free space to the left "
232 "of new free space isn't "
233 "quite right. existing: offset=%llu, "
234 "bytes=%llu. new: offset=%llu, bytes=%llu\n",
235 (unsigned long long)left_info->offset,
236 (unsigned long long)left_info->bytes,
237 (unsigned long long)offset,
238 (unsigned long long)bytes);
239 BUG();
240 }
241
242 if (info) {
243 info->offset = left_info->offset;
244 info->bytes += left_info->bytes;
245 kfree(left_info);
246 } else {
247 info = left_info;
248 info->bytes += bytes;
249 }
250 } 246 }
251 247
252 if (info) {
253 ret = link_free_space(block_group, info);
254 if (!ret)
255 info = NULL;
256 goto out;
257 }
258
259 info = alloc_info;
260 alloc_info = NULL;
261 info->offset = offset;
262 info->bytes = bytes;
263
264 ret = link_free_space(block_group, info); 248 ret = link_free_space(block_group, info);
265 if (ret) 249 if (ret)
266 kfree(info); 250 kfree(info);
267out: 251
252 spin_unlock(&block_group->tree_lock);
253
268 if (ret) { 254 if (ret) {
269 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
270 if (ret == -EEXIST) 256 BUG_ON(ret == -EEXIST);
271 BUG();
272 } 257 }
273 258
274 kfree(alloc_info);
275
276 return ret; 259 return ret;
277} 260}
278 261
279static int 262int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
280__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 263 u64 offset, u64 bytes)
281 u64 offset, u64 bytes)
282{ 264{
283 struct btrfs_free_space *info; 265 struct btrfs_free_space *info;
284 int ret = 0; 266 int ret = 0;
285 267
268 spin_lock(&block_group->tree_lock);
269
286 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 270 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
287 1); 271 1);
288
289 if (info && info->offset == offset) { 272 if (info && info->offset == offset) {
290 if (info->bytes < bytes) { 273 if (info->bytes < bytes) {
291 printk(KERN_ERR "Found free space at %llu, size %llu," 274 printk(KERN_ERR "Found free space at %llu, size %llu,"
@@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
295 (unsigned long long)bytes); 278 (unsigned long long)bytes);
296 WARN_ON(1); 279 WARN_ON(1);
297 ret = -EINVAL; 280 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock);
298 goto out; 282 goto out;
299 } 283 }
300 unlink_free_space(block_group, info); 284 unlink_free_space(block_group, info);
301 285
302 if (info->bytes == bytes) { 286 if (info->bytes == bytes) {
303 kfree(info); 287 kfree(info);
288 spin_unlock(&block_group->tree_lock);
304 goto out; 289 goto out;
305 } 290 }
306 291
@@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
308 info->bytes -= bytes; 293 info->bytes -= bytes;
309 294
310 ret = link_free_space(block_group, info); 295 ret = link_free_space(block_group, info);
296 spin_unlock(&block_group->tree_lock);
311 BUG_ON(ret); 297 BUG_ON(ret);
312 } else if (info && info->offset < offset && 298 } else if (info && info->offset < offset &&
313 info->offset + info->bytes >= offset + bytes) { 299 info->offset + info->bytes >= offset + bytes) {
@@ -333,70 +319,33 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
333 */ 319 */
334 kfree(info); 320 kfree(info);
335 } 321 }
336 322 spin_unlock(&block_group->tree_lock);
337 /* step two, insert a new info struct to cover anything 323 /* step two, insert a new info struct to cover anything
338 * before the hole 324 * before the hole
339 */ 325 */
340 ret = __btrfs_add_free_space(block_group, old_start, 326 ret = btrfs_add_free_space(block_group, old_start,
341 offset - old_start); 327 offset - old_start);
342 BUG_ON(ret); 328 BUG_ON(ret);
343 } else { 329 } else {
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached, block_group->key.objectid,
336 block_group->key.offset);
337 btrfs_dump_free_space(block_group, bytes);
338 } else if (info) {
339 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
340 "but wanted offset=%llu bytes=%llu\n",
341 info->offset, info->bytes, offset, bytes);
342 }
344 WARN_ON(1); 343 WARN_ON(1);
345 } 344 }
346out: 345out:
347 return ret; 346 return ret;
348} 347}
349 348
350int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
351 u64 offset, u64 bytes)
352{
353 int ret;
354 struct btrfs_free_space *sp;
355
356 mutex_lock(&block_group->alloc_mutex);
357 ret = __btrfs_add_free_space(block_group, offset, bytes);
358 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
359 BUG_ON(!sp);
360 mutex_unlock(&block_group->alloc_mutex);
361
362 return ret;
363}
364
365int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
366 u64 offset, u64 bytes)
367{
368 int ret;
369 struct btrfs_free_space *sp;
370
371 ret = __btrfs_add_free_space(block_group, offset, bytes);
372 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
373 BUG_ON(!sp);
374
375 return ret;
376}
377
378int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
379 u64 offset, u64 bytes)
380{
381 int ret = 0;
382
383 mutex_lock(&block_group->alloc_mutex);
384 ret = __btrfs_remove_free_space(block_group, offset, bytes);
385 mutex_unlock(&block_group->alloc_mutex);
386
387 return ret;
388}
389
390int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
391 u64 offset, u64 bytes)
392{
393 int ret;
394
395 ret = __btrfs_remove_free_space(block_group, offset, bytes);
396
397 return ret;
398}
399
400void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 349void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
401 u64 bytes) 350 u64 bytes)
402{ 351{
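In the reworked btrfs_add_free_space() above, the new entry is allocated up front and then, under tree_lock, merged with a right neighbour that starts exactly at offset+bytes (found with an exact search) and a left neighbour that ends exactly at offset. A rough userspace sketch of that coalescing step follows, with an illustrative struct in place of the rbtree entries:

#include <stdint.h>
#include <stdio.h>

/* Illustrative free-space entry; the kernel keeps these in two rbtrees. */
struct span {
	uint64_t offset;
	uint64_t bytes;
	int linked;		/* stands in for linked/unlinked state */
};

/*
 * Coalesce a newly freed range [offset, offset+bytes) with an optional
 * right neighbour (starts exactly at offset+bytes) and left neighbour
 * (ends exactly at offset), mirroring the merge logic in
 * btrfs_add_free_space().
 */
static struct span add_free_range(uint64_t offset, uint64_t bytes,
				  struct span *right, struct span *left)
{
	struct span info = { offset, bytes, 1 };

	if (right && right->offset == offset + bytes) {
		info.bytes += right->bytes;
		right->linked = 0;	/* kernel: unlink_free_space + kfree */
	}
	if (left && left->offset + left->bytes == offset) {
		info.offset = left->offset;
		info.bytes += left->bytes;
		left->linked = 0;
	}
	return info;
}

int main(void)
{
	struct span left = { 0, 4096, 1 };
	struct span right = { 8192, 4096, 1 };
	struct span merged = add_free_range(4096, 4096, &right, &left);

	/* expect one 12 KiB span starting at 0 */
	printf("offset=%llu bytes=%llu\n",
	       (unsigned long long)merged.offset,
	       (unsigned long long)merged.bytes);
	return 0;
}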
@@ -408,6 +357,8 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
408 info = rb_entry(n, struct btrfs_free_space, offset_index); 357 info = rb_entry(n, struct btrfs_free_space, offset_index);
409 if (info->bytes >= bytes) 358 if (info->bytes >= bytes)
410 count++; 359 count++;
360 printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset,
361 info->bytes);
411 } 362 }
412 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 363 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
413 "\n", count); 364 "\n", count);
@@ -428,68 +379,337 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
428 return ret; 379 return ret;
429} 380}
430 381
382/*
383 * for a given cluster, put all of its extents back into the free
384 * space cache. If the block group passed doesn't match the block group
385 * pointed to by the cluster, someone else raced in and freed the
386 * cluster already. In that case, we just return without changing anything
387 */
388static int
389__btrfs_return_cluster_to_free_space(
390 struct btrfs_block_group_cache *block_group,
391 struct btrfs_free_cluster *cluster)
392{
393 struct btrfs_free_space *entry;
394 struct rb_node *node;
395
396 spin_lock(&cluster->lock);
397 if (cluster->block_group != block_group)
398 goto out;
399
400 cluster->window_start = 0;
401 node = rb_first(&cluster->root);
402 while(node) {
403 entry = rb_entry(node, struct btrfs_free_space, offset_index);
404 node = rb_next(&entry->offset_index);
405 rb_erase(&entry->offset_index, &cluster->root);
406 link_free_space(block_group, entry);
407 }
408 list_del_init(&cluster->block_group_list);
409
410 btrfs_put_block_group(cluster->block_group);
411 cluster->block_group = NULL;
412 cluster->root.rb_node = NULL;
413out:
414 spin_unlock(&cluster->lock);
415 return 0;
416}
417
431void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 418void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
432{ 419{
433 struct btrfs_free_space *info; 420 struct btrfs_free_space *info;
434 struct rb_node *node; 421 struct rb_node *node;
422 struct btrfs_free_cluster *cluster;
423 struct btrfs_free_cluster *safe;
424
425 spin_lock(&block_group->tree_lock);
426
427 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
428 block_group_list) {
429
430 WARN_ON(cluster->block_group != block_group);
431 __btrfs_return_cluster_to_free_space(block_group, cluster);
432 }
435 433
436 mutex_lock(&block_group->alloc_mutex);
437 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 434 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
438 info = rb_entry(node, struct btrfs_free_space, bytes_index); 435 info = rb_entry(node, struct btrfs_free_space, bytes_index);
439 unlink_free_space(block_group, info); 436 unlink_free_space(block_group, info);
440 kfree(info); 437 kfree(info);
441 if (need_resched()) { 438 if (need_resched()) {
442 mutex_unlock(&block_group->alloc_mutex); 439 spin_unlock(&block_group->tree_lock);
443 cond_resched(); 440 cond_resched();
444 mutex_lock(&block_group->alloc_mutex); 441 spin_lock(&block_group->tree_lock);
445 } 442 }
446 } 443 }
447 mutex_unlock(&block_group->alloc_mutex); 444 spin_unlock(&block_group->tree_lock);
448} 445}
449 446
450#if 0 447u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
451static struct btrfs_free_space *btrfs_find_free_space_offset(struct 448 u64 offset, u64 bytes, u64 empty_size)
452 btrfs_block_group_cache
453 *block_group, u64 offset,
454 u64 bytes)
455{ 449{
456 struct btrfs_free_space *ret; 450 struct btrfs_free_space *entry = NULL;
451 u64 ret = 0;
457 452
458 mutex_lock(&block_group->alloc_mutex); 453 spin_lock(&block_group->tree_lock);
459 ret = tree_search_offset(&block_group->free_space_offset, offset, 454 entry = tree_search_offset(&block_group->free_space_offset, offset,
460 bytes, 0); 455 bytes + empty_size, 1);
461 mutex_unlock(&block_group->alloc_mutex); 456 if (!entry)
457 entry = tree_search_bytes(&block_group->free_space_bytes,
458 offset, bytes + empty_size);
459 if (entry) {
460 unlink_free_space(block_group, entry);
461 ret = entry->offset;
462 entry->offset += bytes;
463 entry->bytes -= bytes;
464
465 if (!entry->bytes)
466 kfree(entry);
467 else
468 link_free_space(block_group, entry);
469 }
470 spin_unlock(&block_group->tree_lock);
462 471
463 return ret; 472 return ret;
464} 473}
465 474
466static struct btrfs_free_space *btrfs_find_free_space_bytes(struct 475/*
467 btrfs_block_group_cache 476 * given a cluster, put all of its extents back into the free space
468 *block_group, u64 offset, 477 * cache. If a block group is passed, this function will only free
469 u64 bytes) 478 * a cluster that belongs to the passed block group.
479 *
480 * Otherwise, it'll get a reference on the block group pointed to by the
481 * cluster and remove the cluster from it.
482 */
483int btrfs_return_cluster_to_free_space(
484 struct btrfs_block_group_cache *block_group,
485 struct btrfs_free_cluster *cluster)
470{ 486{
471 struct btrfs_free_space *ret; 487 int ret;
472 488
473 mutex_lock(&block_group->alloc_mutex); 489 /* first, get a safe pointer to the block group */
490 spin_lock(&cluster->lock);
491 if (!block_group) {
492 block_group = cluster->block_group;
493 if (!block_group) {
494 spin_unlock(&cluster->lock);
495 return 0;
496 }
497 } else if (cluster->block_group != block_group) {
 498 /* someone else has already freed it, don't redo their work */
499 spin_unlock(&cluster->lock);
500 return 0;
501 }
502 atomic_inc(&block_group->count);
503 spin_unlock(&cluster->lock);
474 504
475 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); 505 /* now return any extents the cluster had on it */
476 mutex_unlock(&block_group->alloc_mutex); 506 spin_lock(&block_group->tree_lock);
507 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
508 spin_unlock(&block_group->tree_lock);
477 509
510 /* finally drop our ref */
511 btrfs_put_block_group(block_group);
478 return ret; 512 return ret;
479} 513}
480#endif
481 514
482struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache 515/*
483 *block_group, u64 offset, 516 * given a cluster, try to allocate 'bytes' from it, returns 0
484 u64 bytes) 517 * if it couldn't find anything suitably large, or a logical disk offset
518 * if things worked out
519 */
520u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
521 struct btrfs_free_cluster *cluster, u64 bytes,
522 u64 min_start)
523{
524 struct btrfs_free_space *entry = NULL;
525 struct rb_node *node;
526 u64 ret = 0;
527
528 spin_lock(&cluster->lock);
529 if (bytes > cluster->max_size)
530 goto out;
531
532 if (cluster->block_group != block_group)
533 goto out;
534
535 node = rb_first(&cluster->root);
536 if (!node)
537 goto out;
538
539 entry = rb_entry(node, struct btrfs_free_space, offset_index);
540
541 while(1) {
542 if (entry->bytes < bytes || entry->offset < min_start) {
543 struct rb_node *node;
544
545 node = rb_next(&entry->offset_index);
546 if (!node)
547 break;
548 entry = rb_entry(node, struct btrfs_free_space,
549 offset_index);
550 continue;
551 }
552 ret = entry->offset;
553
554 entry->offset += bytes;
555 entry->bytes -= bytes;
556
557 if (entry->bytes == 0) {
558 rb_erase(&entry->offset_index, &cluster->root);
559 kfree(entry);
560 }
561 break;
562 }
563out:
564 spin_unlock(&cluster->lock);
565 return ret;
566}
567
568/*
569 * here we try to find a cluster of blocks in a block group. The goal
570 * is to find at least bytes free and up to empty_size + bytes free.
571 * We might not find them all in one contiguous area.
572 *
573 * returns zero and sets up cluster if things worked out, otherwise
 574 * it returns -ENOSPC
575 */
576int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
577 struct btrfs_block_group_cache *block_group,
578 struct btrfs_free_cluster *cluster,
579 u64 offset, u64 bytes, u64 empty_size)
485{ 580{
486 struct btrfs_free_space *ret = NULL; 581 struct btrfs_free_space *entry = NULL;
582 struct rb_node *node;
583 struct btrfs_free_space *next;
584 struct btrfs_free_space *last;
585 u64 min_bytes;
586 u64 window_start;
587 u64 window_free;
588 u64 max_extent = 0;
589 int total_retries = 0;
590 int ret;
591
592 /* for metadata, allow allocates with more holes */
593 if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
594 /*
595 * we want to do larger allocations when we are
596 * flushing out the delayed refs, it helps prevent
597 * making more work as we go along.
598 */
599 if (trans->transaction->delayed_refs.flushing)
600 min_bytes = max(bytes, (bytes + empty_size) >> 1);
601 else
602 min_bytes = max(bytes, (bytes + empty_size) >> 4);
603 } else
604 min_bytes = max(bytes, (bytes + empty_size) >> 2);
605
606 spin_lock(&block_group->tree_lock);
607 spin_lock(&cluster->lock);
608
609 /* someone already found a cluster, hooray */
610 if (cluster->block_group) {
611 ret = 0;
612 goto out;
613 }
614again:
615 min_bytes = min(min_bytes, bytes + empty_size);
616 entry = tree_search_bytes(&block_group->free_space_bytes,
617 offset, min_bytes);
618 if (!entry) {
619 ret = -ENOSPC;
620 goto out;
621 }
622 window_start = entry->offset;
623 window_free = entry->bytes;
624 last = entry;
625 max_extent = entry->bytes;
626
627 while(1) {
 628 /* our window is just right, let's fill it */
629 if (window_free >= bytes + empty_size)
630 break;
487 631
488 ret = tree_search_offset(&block_group->free_space_offset, offset, 632 node = rb_next(&last->offset_index);
489 bytes, 0); 633 if (!node) {
490 if (!ret) 634 ret = -ENOSPC;
491 ret = tree_search_bytes(&block_group->free_space_bytes, 635 goto out;
492 offset, bytes); 636 }
637 next = rb_entry(node, struct btrfs_free_space, offset_index);
638
639 /*
640 * we haven't filled the empty size and the window is
641 * very large. reset and try again
642 */
643 if (next->offset - window_start > (bytes + empty_size) * 2) {
644 entry = next;
645 window_start = entry->offset;
646 window_free = entry->bytes;
647 last = entry;
648 max_extent = 0;
649 total_retries++;
650 if (total_retries % 256 == 0) {
651 if (min_bytes >= (bytes + empty_size)) {
652 ret = -ENOSPC;
653 goto out;
654 }
655 /*
656 * grow our allocation a bit, we're not having
657 * much luck
658 */
659 min_bytes *= 2;
660 goto again;
661 }
662 } else {
663 last = next;
664 window_free += next->bytes;
665 if (entry->bytes > max_extent)
666 max_extent = entry->bytes;
667 }
668 }
669
670 cluster->window_start = entry->offset;
671
672 /*
673 * now we've found our entries, pull them out of the free space
674 * cache and put them into the cluster rbtree
675 *
676 * The cluster includes an rbtree, but only uses the offset index
677 * of each free space cache entry.
678 */
679 while(1) {
680 node = rb_next(&entry->offset_index);
681 unlink_free_space(block_group, entry);
682 ret = tree_insert_offset(&cluster->root, entry->offset,
683 &entry->offset_index);
684 BUG_ON(ret);
685
686 if (!node || entry == last)
687 break;
688
689 entry = rb_entry(node, struct btrfs_free_space, offset_index);
690 }
691 ret = 0;
692 cluster->max_size = max_extent;
693 atomic_inc(&block_group->count);
694 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
695 cluster->block_group = block_group;
696out:
697 spin_unlock(&cluster->lock);
698 spin_unlock(&block_group->tree_lock);
493 699
494 return ret; 700 return ret;
495} 701}
702
703/*
704 * simple code to zero out a cluster
705 */
706void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
707{
708 spin_lock_init(&cluster->lock);
709 spin_lock_init(&cluster->refill_lock);
710 cluster->root.rb_node = NULL;
711 cluster->max_size = 0;
712 INIT_LIST_HEAD(&cluster->block_group_list);
713 cluster->block_group = NULL;
714}
715
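btrfs_find_space_cluster() grows a window of consecutive free-space entries starting from the smallest entry that satisfies min_bytes, restarting the window whenever the next entry sits more than (bytes + empty_size) * 2 past the window start, and doubling min_bytes every 256 restarts when it keeps striking out. A rough userspace model of that window-growing loop follows; it walks a plain array sorted by offset and the names are illustrative, not the kernel code:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct fspace { uint64_t offset; uint64_t bytes; };

/*
 * Grow a window of consecutive entries (sorted by offset) until the
 * free bytes in the window reach 'want'.  If the next entry sits too
 * far from the window start, restart the window at that entry.
 * Returns the index of the first entry of a winning window, or -1.
 * This mirrors the shape of btrfs_find_space_cluster()'s inner loop.
 */
static long find_window(const struct fspace *e, size_t n, uint64_t want)
{
	size_t start;
	uint64_t window_free;
	size_t i;

	if (!n)
		return -1;

	start = 0;
	window_free = e[0].bytes;
	for (i = 1; ; i++) {
		if (window_free >= want)
			return (long)start;
		if (i >= n)
			return -1;
		/* window spread out too much: reset it at this entry */
		if (e[i].offset - e[start].offset > want * 2) {
			start = i;
			window_free = e[i].bytes;
		} else {
			window_free += e[i].bytes;
		}
	}
}

int main(void)
{
	struct fspace ext[] = { {0, 8}, {16, 8}, {4096, 64}, {4200, 64} };
	long idx = find_window(ext, 4, 100);

	printf("window starts at entry %ld (offset %llu)\n", idx,
	       idx >= 0 ? (unsigned long long)ext[idx].offset : 0ULL);
	return 0;
}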
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644
index 000000000000..ab0bdc0a63ce
--- /dev/null
+++ b/fs/btrfs/free-space-cache.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) 2009 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE
21
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
25 u64 bytenr, u64 size);
26void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
27 *block_group);
28u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
29 u64 offset, u64 bytes, u64 empty_size);
30void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
31 u64 bytes);
32u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
33int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
34 struct btrfs_block_group_cache *block_group,
35 struct btrfs_free_cluster *cluster,
36 u64 offset, u64 bytes, u64 empty_size);
37void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
38u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
39 struct btrfs_free_cluster *cluster, u64 bytes,
40 u64 min_start);
41int btrfs_return_cluster_to_free_space(
42 struct btrfs_block_group_cache *block_group,
43 struct btrfs_free_cluster *cluster);
44#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 06d8db5afb08..a0d1dd492a58 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3481,8 +3481,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3481 3481
3482 if (dir) { 3482 if (dir) {
3483 ret = btrfs_set_inode_index(dir, index); 3483 ret = btrfs_set_inode_index(dir, index);
3484 if (ret) 3484 if (ret) {
3485 iput(inode);
3485 return ERR_PTR(ret); 3486 return ERR_PTR(ret);
3487 }
3486 } 3488 }
3487 /* 3489 /*
3488 * index_cnt is ignored for everything but a dir, 3490 * index_cnt is ignored for everything but a dir,
@@ -3565,6 +3567,7 @@ fail:
3565 if (dir) 3567 if (dir)
3566 BTRFS_I(dir)->index_cnt--; 3568 BTRFS_I(dir)->index_cnt--;
3567 btrfs_free_path(path); 3569 btrfs_free_path(path);
3570 iput(inode);
3568 return ERR_PTR(ret); 3571 return ERR_PTR(ret);
3569} 3572}
3570 3573
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index a5310c0f41e2..1c36e5cd8f55 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -60,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
60 60
61/* 61/*
62 * unfortunately, many of the places that currently set a lock to blocking 62 * unfortunately, many of the places that currently set a lock to blocking
63 * don't end up blocking for every long, and often they don't block 63 * don't end up blocking for very long, and often they don't block
64 * at all. For a dbench 50 run, if we don't spin one the blocking bit 64 * at all. For a dbench 50 run, if we don't spin on the blocking bit
65 * at all, the context switch rate can jump up to 400,000/sec or more. 65 * at all, the context switch rate can jump up to 400,000/sec or more.
66 * 66 *
67 * So, we're still stuck with this crummy spin on the blocking bit, 67 * So, we're still stuck with this crummy spin on the blocking bit,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 19a4daf03ccb..9744af9d71e9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -24,6 +24,7 @@
24#include <linux/highmem.h> 24#include <linux/highmem.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/seq_file.h>
27#include <linux/string.h> 28#include <linux/string.h>
28#include <linux/smp_lock.h> 29#include <linux/smp_lock.h>
29#include <linux/backing-dev.h> 30#include <linux/backing-dev.h>
@@ -66,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb)
66enum { 67enum {
67 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
68 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
69 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, 70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog,
71 Opt_flushoncommit, Opt_err,
70}; 72};
71 73
72static match_table_t tokens = { 74static match_table_t tokens = {
@@ -83,6 +85,8 @@ static match_table_t tokens = {
83 {Opt_compress, "compress"}, 85 {Opt_compress, "compress"},
84 {Opt_ssd, "ssd"}, 86 {Opt_ssd, "ssd"},
85 {Opt_noacl, "noacl"}, 87 {Opt_noacl, "noacl"},
88 {Opt_notreelog, "notreelog"},
89 {Opt_flushoncommit, "flushoncommit"},
86 {Opt_err, NULL}, 90 {Opt_err, NULL},
87}; 91};
88 92
@@ -222,6 +226,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
222 case Opt_noacl: 226 case Opt_noacl:
223 root->fs_info->sb->s_flags &= ~MS_POSIXACL; 227 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
224 break; 228 break;
229 case Opt_notreelog:
230 printk(KERN_INFO "btrfs: disabling tree log\n");
231 btrfs_set_opt(info->mount_opt, NOTREELOG);
232 break;
233 case Opt_flushoncommit:
234 printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
235 btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
236 break;
225 default: 237 default:
226 break; 238 break;
227 } 239 }
@@ -363,9 +375,8 @@ fail_close:
363int btrfs_sync_fs(struct super_block *sb, int wait) 375int btrfs_sync_fs(struct super_block *sb, int wait)
364{ 376{
365 struct btrfs_trans_handle *trans; 377 struct btrfs_trans_handle *trans;
366 struct btrfs_root *root; 378 struct btrfs_root *root = btrfs_sb(sb);
367 int ret; 379 int ret;
368 root = btrfs_sb(sb);
369 380
370 if (sb->s_flags & MS_RDONLY) 381 if (sb->s_flags & MS_RDONLY)
371 return 0; 382 return 0;
@@ -385,6 +396,41 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
385 return ret; 396 return ret;
386} 397}
387 398
399static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
400{
401 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
402 struct btrfs_fs_info *info = root->fs_info;
403
404 if (btrfs_test_opt(root, DEGRADED))
405 seq_puts(seq, ",degraded");
406 if (btrfs_test_opt(root, NODATASUM))
407 seq_puts(seq, ",nodatasum");
408 if (btrfs_test_opt(root, NODATACOW))
409 seq_puts(seq, ",nodatacow");
410 if (btrfs_test_opt(root, NOBARRIER))
411 seq_puts(seq, ",nobarrier");
412 if (info->max_extent != (u64)-1)
413 seq_printf(seq, ",max_extent=%llu", info->max_extent);
414 if (info->max_inline != 8192 * 1024)
415 seq_printf(seq, ",max_inline=%llu", info->max_inline);
416 if (info->alloc_start != 0)
417 seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
418 if (info->thread_pool_size != min_t(unsigned long,
419 num_online_cpus() + 2, 8))
420 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
421 if (btrfs_test_opt(root, COMPRESS))
422 seq_puts(seq, ",compress");
423 if (btrfs_test_opt(root, SSD))
424 seq_puts(seq, ",ssd");
425 if (btrfs_test_opt(root, NOTREELOG))
426 seq_puts(seq, ",no-treelog");
427 if (btrfs_test_opt(root, FLUSHONCOMMIT))
428 seq_puts(seq, ",flush-on-commit");
429 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
430 seq_puts(seq, ",noacl");
431 return 0;
432}
433
388static void btrfs_write_super(struct super_block *sb) 434static void btrfs_write_super(struct super_block *sb)
389{ 435{
390 sb->s_dirt = 0; 436 sb->s_dirt = 0;
@@ -630,7 +676,7 @@ static struct super_operations btrfs_super_ops = {
630 .put_super = btrfs_put_super, 676 .put_super = btrfs_put_super,
631 .write_super = btrfs_write_super, 677 .write_super = btrfs_write_super,
632 .sync_fs = btrfs_sync_fs, 678 .sync_fs = btrfs_sync_fs,
633 .show_options = generic_show_options, 679 .show_options = btrfs_show_options,
634 .write_inode = btrfs_write_inode, 680 .write_inode = btrfs_write_inode,
635 .dirty_inode = btrfs_dirty_inode, 681 .dirty_inode = btrfs_dirty_inode,
636 .alloc_inode = btrfs_alloc_inode, 682 .alloc_inode = btrfs_alloc_inode,
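The two new options end up as bits in fs_info->mount_opt, set during parsing via btrfs_set_opt() and reported back through btrfs_show_options() so the active options show up in /proc/mounts. A rough userspace model of that bitmask pattern follows; the macro names and bit values here are illustrative, not the btrfs definitions:

#include <stdio.h>

/* Illustrative option bits; btrfs keeps these in fs_info->mount_opt. */
#define OPT_NOTREELOG     (1UL << 0)
#define OPT_FLUSHONCOMMIT (1UL << 1)
#define OPT_SSD           (1UL << 2)

#define set_opt(o, b)  ((o) |= (b))
#define test_opt(o, b) ((o) & (b))

/* Mirrors the shape of btrfs_show_options(): one ",name" per set bit. */
static void show_options(FILE *seq, unsigned long opts)
{
	if (test_opt(opts, OPT_NOTREELOG))
		fputs(",notreelog", seq);
	if (test_opt(opts, OPT_FLUSHONCOMMIT))
		fputs(",flushoncommit", seq);
	if (test_opt(opts, OPT_SSD))
		fputs(",ssd", seq);
	fputc('\n', seq);
}

int main(void)
{
	unsigned long opts = 0;

	set_opt(opts, OPT_NOTREELOG);
	set_opt(opts, OPT_FLUSHONCOMMIT);
	show_options(stdout, opts);	/* ,notreelog,flushoncommit */
	return 0;
}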
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 664782c6a2df..2869b3361eb6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root)
53 GFP_NOFS); 53 GFP_NOFS);
54 BUG_ON(!cur_trans); 54 BUG_ON(!cur_trans);
55 root->fs_info->generation++; 55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1; 56 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0; 57 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation; 58 cur_trans->transid = root->fs_info->generation;
@@ -974,6 +972,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
974 int ret; 972 int ret;
975 int should_grow = 0; 973 int should_grow = 0;
976 unsigned long now = get_seconds(); 974 unsigned long now = get_seconds();
975 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
977 976
978 btrfs_run_ordered_operations(root, 0); 977 btrfs_run_ordered_operations(root, 0);
979 978
@@ -1053,7 +1052,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1053 1052
1054 mutex_unlock(&root->fs_info->trans_mutex); 1053 mutex_unlock(&root->fs_info->trans_mutex);
1055 1054
1056 if (snap_pending) { 1055 if (flush_on_commit || snap_pending) {
1056 if (flush_on_commit)
1057 btrfs_start_delalloc_inodes(root);
1057 ret = btrfs_wait_ordered_extents(root, 1); 1058 ret = btrfs_wait_ordered_extents(root, 1);
1058 BUG_ON(ret); 1059 BUG_ON(ret);
1059 } 1060 }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fc9b87a7975b..25f20ea11f27 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -262,11 +262,9 @@ static int process_one_buffer(struct btrfs_root *log,
262 struct extent_buffer *eb, 262 struct extent_buffer *eb,
263 struct walk_control *wc, u64 gen) 263 struct walk_control *wc, u64 gen)
264{ 264{
265 if (wc->pin) { 265 if (wc->pin)
266 mutex_lock(&log->fs_info->pinned_mutex);
267 btrfs_update_pinned_extents(log->fs_info->extent_root, 266 btrfs_update_pinned_extents(log->fs_info->extent_root,
268 eb->start, eb->len, 1); 267 eb->start, eb->len, 1);
269 }
270 268
271 if (btrfs_buffer_uptodate(eb, gen)) { 269 if (btrfs_buffer_uptodate(eb, gen)) {
272 if (wc->write) 270 if (wc->write)
@@ -1224,8 +1222,7 @@ insert:
1224 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1222 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1225 name, name_len, log_type, &log_key); 1223 name, name_len, log_type, &log_key);
1226 1224
1227 if (ret && ret != -ENOENT) 1225 BUG_ON(ret && ret != -ENOENT);
1228 BUG();
1229 goto out; 1226 goto out;
1230} 1227}
1231 1228
@@ -2900,6 +2897,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2900 2897
2901 sb = inode->i_sb; 2898 sb = inode->i_sb;
2902 2899
2900 if (btrfs_test_opt(root, NOTREELOG)) {
2901 ret = 1;
2902 goto end_no_trans;
2903 }
2904
2903 if (root->fs_info->last_trans_log_full_commit > 2905 if (root->fs_info->last_trans_log_full_commit >
2904 root->fs_info->last_trans_committed) { 2906 root->fs_info->last_trans_committed) {
2905 ret = 1; 2907 ret = 1;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd06e18e5aac..e0913e469728 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,6 +20,7 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/iocontext.h>
23#include <asm/div64.h> 24#include <asm/div64.h>
24#include "compat.h" 25#include "compat.h"
25#include "ctree.h" 26#include "ctree.h"
@@ -145,8 +146,9 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
145 int again = 0; 146 int again = 0;
146 unsigned long num_run = 0; 147 unsigned long num_run = 0;
147 unsigned long limit; 148 unsigned long limit;
149 unsigned long last_waited = 0;
148 150
149 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; 151 bdi = blk_get_backing_dev_info(device->bdev);
150 fs_info = device->dev_root->fs_info; 152 fs_info = device->dev_root->fs_info;
151 limit = btrfs_async_submit_limit(fs_info); 153 limit = btrfs_async_submit_limit(fs_info);
152 limit = limit * 2 / 3; 154 limit = limit * 2 / 3;
@@ -207,7 +209,32 @@ loop_lock:
207 if (pending && bdi_write_congested(bdi) && num_run > 16 && 209 if (pending && bdi_write_congested(bdi) && num_run > 16 &&
208 fs_info->fs_devices->open_devices > 1) { 210 fs_info->fs_devices->open_devices > 1) {
209 struct bio *old_head; 211 struct bio *old_head;
212 struct io_context *ioc;
210 213
214 ioc = current->io_context;
215
216 /*
217 * the main goal here is that we don't want to
218 * block if we're going to be able to submit
219 * more requests without blocking.
220 *
221 * This code does two great things, it pokes into
222 * the elevator code from a filesystem _and_
223 * it makes assumptions about how batching works.
224 */
225 if (ioc && ioc->nr_batch_requests > 0 &&
226 time_before(jiffies, ioc->last_waited + HZ/50UL) &&
227 (last_waited == 0 ||
228 ioc->last_waited == last_waited)) {
229 /*
230 * we want to go through our batch of
231 * requests and stop. So, we copy out
232 * the ioc->last_waited time and test
233 * against it before looping
234 */
235 last_waited = ioc->last_waited;
236 continue;
237 }
211 spin_lock(&device->io_lock); 238 spin_lock(&device->io_lock);
212 239
213 old_head = device->pending_bios; 240 old_head = device->pending_bios;
@@ -231,6 +258,18 @@ loop_lock:
231 if (device->pending_bios) 258 if (device->pending_bios)
232 goto loop_lock; 259 goto loop_lock;
233 spin_unlock(&device->io_lock); 260 spin_unlock(&device->io_lock);
261
262 /*
263 * IO has already been through a long path to get here. Checksumming,
264 * async helper threads, perhaps compression. We've done a pretty
265 * good job of collecting a batch of IO and should just unplug
266 * the device right away.
267 *
268 * This will help anyone who is waiting on the IO, they might have
269 * already unplugged, but managed to do so before the bio they
270 * cared about found its way down here.
271 */
272 blk_run_backing_dev(bdi, NULL);
234done: 273done:
235 return 0; 274 return 0;
236} 275}
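The new check in run_scheduled_bios() peeks at the submitting task's io_context: if the task still has batch requests available, waited on the queue only very recently (within HZ/50), and this is the same wait the loop has already recorded, the loop keeps submitting instead of backing off on congestion. A rough userspace model of that decision follows, with an illustrative struct standing in for the io_context fields it reads (and a plain comparison instead of time_before(), so jiffies wraparound is ignored):

#include <stdio.h>

/* Illustrative stand-in for the io_context fields the check reads. */
struct ioctx {
	int nr_batch_requests;		/* batch slots the task may still use */
	unsigned long last_waited;	/* "time" the task last waited on the queue */
};

/*
 * Decide whether to keep pushing bios for this device even though the
 * backing device looks congested: the submitter is mid-batch and waited
 * only very recently, so more submissions are unlikely to block.
 * Mirrors the check added to run_scheduled_bios().
 */
static int keep_batching(const struct ioctx *ioc, unsigned long now,
			 unsigned long batch_window,
			 unsigned long *last_waited)
{
	if (ioc && ioc->nr_batch_requests > 0 &&
	    now < ioc->last_waited + batch_window &&
	    (*last_waited == 0 || ioc->last_waited == *last_waited)) {
		*last_waited = ioc->last_waited;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct ioctx ioc = { .nr_batch_requests = 4, .last_waited = 100 };
	unsigned long last_waited = 0;

	printf("%d\n", keep_batching(&ioc, 101, 20, &last_waited)); /* 1 */
	printf("%d\n", keep_batching(&ioc, 500, 20, &last_waited)); /* 0: waited too long ago */
	return 0;
}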
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 86c44e9ae110..2185de72ff7d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -76,7 +76,7 @@ struct btrfs_device {
76struct btrfs_fs_devices { 76struct btrfs_fs_devices {
77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ 77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
78 78
79 /* the device with this id has the most recent coyp of the super */ 79 /* the device with this id has the most recent copy of the super */
80 u64 latest_devid; 80 u64 latest_devid;
81 u64 latest_trans; 81 u64 latest_trans;
82 u64 num_devices; 82 u64 num_devices;