aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/extent-tree.c
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-03-02 16:08:56 -0500
committerIngo Molnar <mingo@elte.hu>2009-03-02 16:08:56 -0500
commitc02368a9d059322f913a58111eade87a656fefd5 (patch)
tree2f02dbbe69b86535f58d2010d9adfb20a9c16fb9 /fs/btrfs/extent-tree.c
parentf17c75453b2d195eba0a90d9f16a3ba88c85b3b4 (diff)
parent778ef1e6cbb049c9bcbf405936ee6f2b6e451892 (diff)
Merge branch 'linus' into irq/genirq
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--fs/btrfs/extent-tree.c771
1 files changed, 652 insertions, 119 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 293da650873f..6b5966aacf44 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,7 +19,7 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/version.h> 22#include <linux/sort.h>
23#include "compat.h" 23#include "compat.h"
24#include "hash.h" 24#include "hash.h"
25#include "crc32c.h" 25#include "crc32c.h"
@@ -30,7 +30,6 @@
30#include "volumes.h" 30#include "volumes.h"
31#include "locking.h" 31#include "locking.h"
32#include "ref-cache.h" 32#include "ref-cache.h"
33#include "compat.h"
34 33
35#define PENDING_EXTENT_INSERT 0 34#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1 35#define PENDING_EXTENT_DELETE 1
@@ -61,6 +60,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
61 u64 bytenr, u64 num_bytes, int alloc, 60 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free); 61 int mark_free);
63 62
63static int do_chunk_alloc(struct btrfs_trans_handle *trans,
64 struct btrfs_root *extent_root, u64 alloc_bytes,
65 u64 flags, int force);
66
64static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 67static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65{ 68{
66 return (cache->flags & bits) == bits; 69 return (cache->flags & bits) == bits;
@@ -326,10 +329,8 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
326 u64 flags) 329 u64 flags)
327{ 330{
328 struct list_head *head = &info->space_info; 331 struct list_head *head = &info->space_info;
329 struct list_head *cur;
330 struct btrfs_space_info *found; 332 struct btrfs_space_info *found;
331 list_for_each(cur, head) { 333 list_for_each_entry(found, head, list) {
332 found = list_entry(cur, struct btrfs_space_info, list);
333 if (found->flags == flags) 334 if (found->flags == flags)
334 return found; 335 return found;
335 } 336 }
@@ -1326,8 +1327,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1326int btrfs_extent_post_op(struct btrfs_trans_handle *trans, 1327int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root) 1328 struct btrfs_root *root)
1328{ 1329{
1329 finish_current_insert(trans, root->fs_info->extent_root, 1); 1330 u64 start;
1330 del_pending_extents(trans, root->fs_info->extent_root, 1); 1331 u64 end;
1332 int ret;
1333
1334 while(1) {
1335 finish_current_insert(trans, root->fs_info->extent_root, 1);
1336 del_pending_extents(trans, root->fs_info->extent_root, 1);
1337
1338 /* is there more work to do? */
1339 ret = find_first_extent_bit(&root->fs_info->pending_del,
1340 0, &start, &end, EXTENT_WRITEBACK);
1341 if (!ret)
1342 continue;
1343 ret = find_first_extent_bit(&root->fs_info->extent_ins,
1344 0, &start, &end, EXTENT_WRITEBACK);
1345 if (!ret)
1346 continue;
1347 break;
1348 }
1331 return 0; 1349 return 0;
1332} 1350}
1333 1351
@@ -1525,15 +1543,55 @@ out:
1525 return ret; 1543 return ret;
1526} 1544}
1527 1545
1528int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1546/* when a block goes through cow, we update the reference counts of
1529 struct extent_buffer *orig_buf, struct extent_buffer *buf, 1547 * everything that block points to. The internal pointers of the block
1530 u32 *nr_extents) 1548 * can be in just about any order, and it is likely to have clusters of
1549 * things that are close together and clusters of things that are not.
1550 *
1551 * To help reduce the seeks that come with updating all of these reference
1552 * counts, sort them by byte number before actual updates are done.
1553 *
1554 * struct refsort is used to match byte number to slot in the btree block.
1555 * we sort based on the byte number and then use the slot to actually
1556 * find the item.
1557 *
1558 * struct refsort is smaller than struct btrfs_item and smaller than
1559 * struct btrfs_key_ptr. Since we're currently limited to the page size
1560 * for a btree block, there's no way for a kmalloc of refsorts for a
1561 * single node to be bigger than a page.
1562 */
1563struct refsort {
1564 u64 bytenr;
1565 u32 slot;
1566};
1567
1568/*
1569 * for passing into sort()
1570 */
1571static int refsort_cmp(const void *a_void, const void *b_void)
1572{
1573 const struct refsort *a = a_void;
1574 const struct refsort *b = b_void;
1575
1576 if (a->bytenr < b->bytenr)
1577 return -1;
1578 if (a->bytenr > b->bytenr)
1579 return 1;
1580 return 0;
1581}
1582
1583
1584noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1585 struct btrfs_root *root,
1586 struct extent_buffer *orig_buf,
1587 struct extent_buffer *buf, u32 *nr_extents)
1531{ 1588{
1532 u64 bytenr; 1589 u64 bytenr;
1533 u64 ref_root; 1590 u64 ref_root;
1534 u64 orig_root; 1591 u64 orig_root;
1535 u64 ref_generation; 1592 u64 ref_generation;
1536 u64 orig_generation; 1593 u64 orig_generation;
1594 struct refsort *sorted;
1537 u32 nritems; 1595 u32 nritems;
1538 u32 nr_file_extents = 0; 1596 u32 nr_file_extents = 0;
1539 struct btrfs_key key; 1597 struct btrfs_key key;
@@ -1542,6 +1600,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1542 int level; 1600 int level;
1543 int ret = 0; 1601 int ret = 0;
1544 int faili = 0; 1602 int faili = 0;
1603 int refi = 0;
1604 int slot;
1545 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 1605 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1546 u64, u64, u64, u64, u64, u64, u64, u64); 1606 u64, u64, u64, u64, u64, u64, u64, u64);
1547 1607
@@ -1553,6 +1613,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1553 nritems = btrfs_header_nritems(buf); 1613 nritems = btrfs_header_nritems(buf);
1554 level = btrfs_header_level(buf); 1614 level = btrfs_header_level(buf);
1555 1615
1616 sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
1617 BUG_ON(!sorted);
1618
1556 if (root->ref_cows) { 1619 if (root->ref_cows) {
1557 process_func = __btrfs_inc_extent_ref; 1620 process_func = __btrfs_inc_extent_ref;
1558 } else { 1621 } else {
@@ -1565,6 +1628,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1565 process_func = __btrfs_update_extent_ref; 1628 process_func = __btrfs_update_extent_ref;
1566 } 1629 }
1567 1630
1631 /*
1632 * we make two passes through the items. In the first pass we
1633 * only record the byte number and slot. Then we sort based on
1634 * byte number and do the actual work based on the sorted results
1635 */
1568 for (i = 0; i < nritems; i++) { 1636 for (i = 0; i < nritems; i++) {
1569 cond_resched(); 1637 cond_resched();
1570 if (level == 0) { 1638 if (level == 0) {
@@ -1581,6 +1649,32 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1581 continue; 1649 continue;
1582 1650
1583 nr_file_extents++; 1651 nr_file_extents++;
1652 sorted[refi].bytenr = bytenr;
1653 sorted[refi].slot = i;
1654 refi++;
1655 } else {
1656 bytenr = btrfs_node_blockptr(buf, i);
1657 sorted[refi].bytenr = bytenr;
1658 sorted[refi].slot = i;
1659 refi++;
1660 }
1661 }
1662 /*
1663 * if refi == 0, we didn't actually put anything into the sorted
1664 * array and we're done
1665 */
1666 if (refi == 0)
1667 goto out;
1668
1669 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1670
1671 for (i = 0; i < refi; i++) {
1672 cond_resched();
1673 slot = sorted[i].slot;
1674 bytenr = sorted[i].bytenr;
1675
1676 if (level == 0) {
1677 btrfs_item_key_to_cpu(buf, &key, slot);
1584 1678
1585 ret = process_func(trans, root, bytenr, 1679 ret = process_func(trans, root, bytenr,
1586 orig_buf->start, buf->start, 1680 orig_buf->start, buf->start,
@@ -1589,25 +1683,25 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1589 key.objectid); 1683 key.objectid);
1590 1684
1591 if (ret) { 1685 if (ret) {
1592 faili = i; 1686 faili = slot;
1593 WARN_ON(1); 1687 WARN_ON(1);
1594 goto fail; 1688 goto fail;
1595 } 1689 }
1596 } else { 1690 } else {
1597 bytenr = btrfs_node_blockptr(buf, i);
1598 ret = process_func(trans, root, bytenr, 1691 ret = process_func(trans, root, bytenr,
1599 orig_buf->start, buf->start, 1692 orig_buf->start, buf->start,
1600 orig_root, ref_root, 1693 orig_root, ref_root,
1601 orig_generation, ref_generation, 1694 orig_generation, ref_generation,
1602 level - 1); 1695 level - 1);
1603 if (ret) { 1696 if (ret) {
1604 faili = i; 1697 faili = slot;
1605 WARN_ON(1); 1698 WARN_ON(1);
1606 goto fail; 1699 goto fail;
1607 } 1700 }
1608 } 1701 }
1609 } 1702 }
1610out: 1703out:
1704 kfree(sorted);
1611 if (nr_extents) { 1705 if (nr_extents) {
1612 if (level == 0) 1706 if (level == 0)
1613 *nr_extents = nr_file_extents; 1707 *nr_extents = nr_file_extents;
@@ -1616,6 +1710,7 @@ out:
1616 } 1710 }
1617 return 0; 1711 return 0;
1618fail: 1712fail:
1713 kfree(sorted);
1619 WARN_ON(1); 1714 WARN_ON(1);
1620 return ret; 1715 return ret;
1621} 1716}
@@ -1818,6 +1913,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1818 found->bytes_pinned = 0; 1913 found->bytes_pinned = 0;
1819 found->bytes_reserved = 0; 1914 found->bytes_reserved = 0;
1820 found->bytes_readonly = 0; 1915 found->bytes_readonly = 0;
1916 found->bytes_delalloc = 0;
1821 found->full = 0; 1917 found->full = 0;
1822 found->force_alloc = 0; 1918 found->force_alloc = 0;
1823 *space_info = found; 1919 *space_info = found;
@@ -1881,6 +1977,233 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1881 return flags; 1977 return flags;
1882} 1978}
1883 1979
1980static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
1981{
1982 struct btrfs_fs_info *info = root->fs_info;
1983 u64 alloc_profile;
1984
1985 if (data) {
1986 alloc_profile = info->avail_data_alloc_bits &
1987 info->data_alloc_profile;
1988 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
1989 } else if (root == root->fs_info->chunk_root) {
1990 alloc_profile = info->avail_system_alloc_bits &
1991 info->system_alloc_profile;
1992 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
1993 } else {
1994 alloc_profile = info->avail_metadata_alloc_bits &
1995 info->metadata_alloc_profile;
1996 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
1997 }
1998
1999 return btrfs_reduce_alloc_profile(root, data);
2000}
2001
2002void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2003{
2004 u64 alloc_target;
2005
2006 alloc_target = btrfs_get_alloc_profile(root, 1);
2007 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2008 alloc_target);
2009}
2010
2011/*
2012 * for now this just makes sure we have at least 5% of our metadata space free
2013 * for use.
2014 */
2015int btrfs_check_metadata_free_space(struct btrfs_root *root)
2016{
2017 struct btrfs_fs_info *info = root->fs_info;
2018 struct btrfs_space_info *meta_sinfo;
2019 u64 alloc_target, thresh;
2020 int committed = 0, ret;
2021
2022 /* get the space info for where the metadata will live */
2023 alloc_target = btrfs_get_alloc_profile(root, 0);
2024 meta_sinfo = __find_space_info(info, alloc_target);
2025
2026again:
2027 spin_lock(&meta_sinfo->lock);
2028 if (!meta_sinfo->full)
2029 thresh = meta_sinfo->total_bytes * 80;
2030 else
2031 thresh = meta_sinfo->total_bytes * 95;
2032
2033 do_div(thresh, 100);
2034
2035 if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2036 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
2037 struct btrfs_trans_handle *trans;
2038 if (!meta_sinfo->full) {
2039 meta_sinfo->force_alloc = 1;
2040 spin_unlock(&meta_sinfo->lock);
2041
2042 trans = btrfs_start_transaction(root, 1);
2043 if (!trans)
2044 return -ENOMEM;
2045
2046 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2047 2 * 1024 * 1024, alloc_target, 0);
2048 btrfs_end_transaction(trans, root);
2049 goto again;
2050 }
2051 spin_unlock(&meta_sinfo->lock);
2052
2053 if (!committed) {
2054 committed = 1;
2055 trans = btrfs_join_transaction(root, 1);
2056 if (!trans)
2057 return -ENOMEM;
2058 ret = btrfs_commit_transaction(trans, root);
2059 if (ret)
2060 return ret;
2061 goto again;
2062 }
2063 return -ENOSPC;
2064 }
2065 spin_unlock(&meta_sinfo->lock);
2066
2067 return 0;
2068}
2069
2070/*
2071 * This will check the space that the inode allocates from to make sure we have
2072 * enough space for bytes.
2073 */
2074int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2075 u64 bytes)
2076{
2077 struct btrfs_space_info *data_sinfo;
2078 int ret = 0, committed = 0;
2079
2080 /* make sure bytes are sectorsize aligned */
2081 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2082
2083 data_sinfo = BTRFS_I(inode)->space_info;
2084again:
2085 /* make sure we have enough space to handle the data first */
2086 spin_lock(&data_sinfo->lock);
2087 if (data_sinfo->total_bytes - data_sinfo->bytes_used -
2088 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
2089 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
2090 data_sinfo->bytes_may_use < bytes) {
2091 struct btrfs_trans_handle *trans;
2092
2093 /*
2094 * if we don't have enough free bytes in this space then we need
2095 * to alloc a new chunk.
2096 */
2097 if (!data_sinfo->full) {
2098 u64 alloc_target;
2099
2100 data_sinfo->force_alloc = 1;
2101 spin_unlock(&data_sinfo->lock);
2102
2103 alloc_target = btrfs_get_alloc_profile(root, 1);
2104 trans = btrfs_start_transaction(root, 1);
2105 if (!trans)
2106 return -ENOMEM;
2107
2108 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2109 bytes + 2 * 1024 * 1024,
2110 alloc_target, 0);
2111 btrfs_end_transaction(trans, root);
2112 if (ret)
2113 return ret;
2114 goto again;
2115 }
2116 spin_unlock(&data_sinfo->lock);
2117
2118 /* commit the current transaction and try again */
2119 if (!committed) {
2120 committed = 1;
2121 trans = btrfs_join_transaction(root, 1);
2122 if (!trans)
2123 return -ENOMEM;
2124 ret = btrfs_commit_transaction(trans, root);
2125 if (ret)
2126 return ret;
2127 goto again;
2128 }
2129
2130 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
2131 ", %llu bytes_used, %llu bytes_reserved, "
2132 "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
2133 "%llu total\n", bytes, data_sinfo->bytes_delalloc,
2134 data_sinfo->bytes_used, data_sinfo->bytes_reserved,
2135 data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
2136 data_sinfo->bytes_may_use, data_sinfo->total_bytes);
2137 return -ENOSPC;
2138 }
2139 data_sinfo->bytes_may_use += bytes;
2140 BTRFS_I(inode)->reserved_bytes += bytes;
2141 spin_unlock(&data_sinfo->lock);
2142
2143 return btrfs_check_metadata_free_space(root);
2144}
2145
2146/*
2147 * if there was an error for whatever reason after calling
2148 * btrfs_check_data_free_space, call this so we can cleanup the counters.
2149 */
2150void btrfs_free_reserved_data_space(struct btrfs_root *root,
2151 struct inode *inode, u64 bytes)
2152{
2153 struct btrfs_space_info *data_sinfo;
2154
2155 /* make sure bytes are sectorsize aligned */
2156 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2157
2158 data_sinfo = BTRFS_I(inode)->space_info;
2159 spin_lock(&data_sinfo->lock);
2160 data_sinfo->bytes_may_use -= bytes;
2161 BTRFS_I(inode)->reserved_bytes -= bytes;
2162 spin_unlock(&data_sinfo->lock);
2163}
2164
2165/* called when we are adding a delalloc extent to the inode's io_tree */
2166void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
2167 u64 bytes)
2168{
2169 struct btrfs_space_info *data_sinfo;
2170
2171 /* get the space info for where this inode will be storing its data */
2172 data_sinfo = BTRFS_I(inode)->space_info;
2173
2174 /* make sure we have enough space to handle the data first */
2175 spin_lock(&data_sinfo->lock);
2176 data_sinfo->bytes_delalloc += bytes;
2177
2178 /*
2179 * we are adding a delalloc extent without calling
2180 * btrfs_check_data_free_space first. This happens on a weird
2181 * writepage condition, but shouldn't hurt our accounting
2182 */
2183 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
2184 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
2185 BTRFS_I(inode)->reserved_bytes = 0;
2186 } else {
2187 data_sinfo->bytes_may_use -= bytes;
2188 BTRFS_I(inode)->reserved_bytes -= bytes;
2189 }
2190
2191 spin_unlock(&data_sinfo->lock);
2192}
2193
2194/* called when we are clearing a delalloc extent from the inode's io_tree */
2195void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
2196 u64 bytes)
2197{
2198 struct btrfs_space_info *info;
2199
2200 info = BTRFS_I(inode)->space_info;
2201
2202 spin_lock(&info->lock);
2203 info->bytes_delalloc -= bytes;
2204 spin_unlock(&info->lock);
2205}
2206
1884static int do_chunk_alloc(struct btrfs_trans_handle *trans, 2207static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1885 struct btrfs_root *extent_root, u64 alloc_bytes, 2208 struct btrfs_root *extent_root, u64 alloc_bytes,
1886 u64 flags, int force) 2209 u64 flags, int force)
@@ -2137,13 +2460,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
2137 u64 end; 2460 u64 end;
2138 u64 priv; 2461 u64 priv;
2139 u64 search = 0; 2462 u64 search = 0;
2140 u64 skipped = 0;
2141 struct btrfs_fs_info *info = extent_root->fs_info; 2463 struct btrfs_fs_info *info = extent_root->fs_info;
2142 struct btrfs_path *path; 2464 struct btrfs_path *path;
2143 struct pending_extent_op *extent_op, *tmp; 2465 struct pending_extent_op *extent_op, *tmp;
2144 struct list_head insert_list, update_list; 2466 struct list_head insert_list, update_list;
2145 int ret; 2467 int ret;
2146 int num_inserts = 0, max_inserts; 2468 int num_inserts = 0, max_inserts, restart = 0;
2147 2469
2148 path = btrfs_alloc_path(); 2470 path = btrfs_alloc_path();
2149 INIT_LIST_HEAD(&insert_list); 2471 INIT_LIST_HEAD(&insert_list);
@@ -2159,18 +2481,19 @@ again:
2159 ret = find_first_extent_bit(&info->extent_ins, search, &start, 2481 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2160 &end, EXTENT_WRITEBACK); 2482 &end, EXTENT_WRITEBACK);
2161 if (ret) { 2483 if (ret) {
2162 if (skipped && all && !num_inserts) { 2484 if (restart && !num_inserts &&
2163 skipped = 0; 2485 list_empty(&update_list)) {
2486 restart = 0;
2164 search = 0; 2487 search = 0;
2165 continue; 2488 continue;
2166 } 2489 }
2167 mutex_unlock(&info->extent_ins_mutex);
2168 break; 2490 break;
2169 } 2491 }
2170 2492
2171 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); 2493 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2172 if (!ret) { 2494 if (!ret) {
2173 skipped = 1; 2495 if (all)
2496 restart = 1;
2174 search = end + 1; 2497 search = end + 1;
2175 if (need_resched()) { 2498 if (need_resched()) {
2176 mutex_unlock(&info->extent_ins_mutex); 2499 mutex_unlock(&info->extent_ins_mutex);
@@ -2189,7 +2512,7 @@ again:
2189 list_add_tail(&extent_op->list, &insert_list); 2512 list_add_tail(&extent_op->list, &insert_list);
2190 search = end + 1; 2513 search = end + 1;
2191 if (num_inserts == max_inserts) { 2514 if (num_inserts == max_inserts) {
2192 mutex_unlock(&info->extent_ins_mutex); 2515 restart = 1;
2193 break; 2516 break;
2194 } 2517 }
2195 } else if (extent_op->type == PENDING_BACKREF_UPDATE) { 2518 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
@@ -2205,7 +2528,6 @@ again:
2205 * somebody marked this thing for deletion then just unlock it and be 2528 * somebody marked this thing for deletion then just unlock it and be
2206 * done, the free_extents will handle it 2529 * done, the free_extents will handle it
2207 */ 2530 */
2208 mutex_lock(&info->extent_ins_mutex);
2209 list_for_each_entry_safe(extent_op, tmp, &update_list, list) { 2531 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2210 clear_extent_bits(&info->extent_ins, extent_op->bytenr, 2532 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2211 extent_op->bytenr + extent_op->num_bytes - 1, 2533 extent_op->bytenr + extent_op->num_bytes - 1,
@@ -2227,6 +2549,10 @@ again:
2227 if (!list_empty(&update_list)) { 2549 if (!list_empty(&update_list)) {
2228 ret = update_backrefs(trans, extent_root, path, &update_list); 2550 ret = update_backrefs(trans, extent_root, path, &update_list);
2229 BUG_ON(ret); 2551 BUG_ON(ret);
2552
2553 /* we may have COW'ed new blocks, so lets start over */
2554 if (all)
2555 restart = 1;
2230 } 2556 }
2231 2557
2232 /* 2558 /*
@@ -2234,9 +2560,9 @@ again:
2234 * need to make sure everything is cleaned then reset everything and 2560 * need to make sure everything is cleaned then reset everything and
2235 * go back to the beginning 2561 * go back to the beginning
2236 */ 2562 */
2237 if (!num_inserts && all && skipped) { 2563 if (!num_inserts && restart) {
2238 search = 0; 2564 search = 0;
2239 skipped = 0; 2565 restart = 0;
2240 INIT_LIST_HEAD(&update_list); 2566 INIT_LIST_HEAD(&update_list);
2241 INIT_LIST_HEAD(&insert_list); 2567 INIT_LIST_HEAD(&insert_list);
2242 goto again; 2568 goto again;
@@ -2293,27 +2619,19 @@ again:
2293 BUG_ON(ret); 2619 BUG_ON(ret);
2294 2620
2295 /* 2621 /*
2296 * if we broke out of the loop in order to insert stuff because we hit 2622 * if restart is set for whatever reason we need to go back and start
2297 * the maximum number of inserts at a time we can handle, then loop 2623 * searching through the pending list again.
2298 * back and pick up where we left off 2624 *
2299 */ 2625 * We just inserted some extents, which could have resulted in new
2300 if (num_inserts == max_inserts) { 2626 * blocks being allocated, which would result in new blocks needing
2301 INIT_LIST_HEAD(&insert_list); 2627 * updates, so if all is set we _must_ restart to get the updated
2302 INIT_LIST_HEAD(&update_list); 2628 * blocks.
2303 num_inserts = 0;
2304 goto again;
2305 }
2306
2307 /*
2308 * again, if we need to make absolutely sure there are no more pending
2309 * extent operations left and we know that we skipped some, go back to
2310 * the beginning and do it all again
2311 */ 2629 */
2312 if (all && skipped) { 2630 if (restart || all) {
2313 INIT_LIST_HEAD(&insert_list); 2631 INIT_LIST_HEAD(&insert_list);
2314 INIT_LIST_HEAD(&update_list); 2632 INIT_LIST_HEAD(&update_list);
2315 search = 0; 2633 search = 0;
2316 skipped = 0; 2634 restart = 0;
2317 num_inserts = 0; 2635 num_inserts = 0;
2318 goto again; 2636 goto again;
2319 } 2637 }
@@ -2547,6 +2865,7 @@ again:
2547 if (ret) { 2865 if (ret) {
2548 if (all && skipped && !nr) { 2866 if (all && skipped && !nr) {
2549 search = 0; 2867 search = 0;
2868 skipped = 0;
2550 continue; 2869 continue;
2551 } 2870 }
2552 mutex_unlock(&info->extent_ins_mutex); 2871 mutex_unlock(&info->extent_ins_mutex);
@@ -2633,6 +2952,8 @@ again:
2633 goto again; 2952 goto again;
2634 } 2953 }
2635 2954
2955 if (!err)
2956 finish_current_insert(trans, extent_root, 0);
2636 return err; 2957 return err;
2637} 2958}
2638 2959
@@ -2700,13 +3021,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2700 /* if metadata always pin */ 3021 /* if metadata always pin */
2701 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 3022 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2702 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3023 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2703 struct btrfs_block_group_cache *cache; 3024 mutex_lock(&root->fs_info->pinned_mutex);
2704 3025 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2705 /* btrfs_free_reserved_extent */ 3026 mutex_unlock(&root->fs_info->pinned_mutex);
2706 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2707 BUG_ON(!cache);
2708 btrfs_add_free_space(cache, bytenr, num_bytes);
2709 put_block_group(cache);
2710 update_reserved_extents(root, bytenr, num_bytes, 0); 3027 update_reserved_extents(root, bytenr, num_bytes, 0);
2711 return 0; 3028 return 0;
2712 } 3029 }
@@ -2787,7 +3104,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2787 3104
2788 if (data & BTRFS_BLOCK_GROUP_METADATA) { 3105 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2789 last_ptr = &root->fs_info->last_alloc; 3106 last_ptr = &root->fs_info->last_alloc;
2790 empty_cluster = 64 * 1024; 3107 if (!btrfs_test_opt(root, SSD))
3108 empty_cluster = 64 * 1024;
2791 } 3109 }
2792 3110
2793 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 3111 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
@@ -3014,16 +3332,18 @@ loop_check:
3014static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 3332static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3015{ 3333{
3016 struct btrfs_block_group_cache *cache; 3334 struct btrfs_block_group_cache *cache;
3017 struct list_head *l;
3018 3335
3019 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 3336 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3020 (unsigned long long)(info->total_bytes - info->bytes_used - 3337 (unsigned long long)(info->total_bytes - info->bytes_used -
3021 info->bytes_pinned - info->bytes_reserved), 3338 info->bytes_pinned - info->bytes_reserved),
3022 (info->full) ? "" : "not "); 3339 (info->full) ? "" : "not ");
3340 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
3341 " may_use=%llu, used=%llu\n", info->total_bytes,
3342 info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
3343 info->bytes_used);
3023 3344
3024 down_read(&info->groups_sem); 3345 down_read(&info->groups_sem);
3025 list_for_each(l, &info->block_groups) { 3346 list_for_each_entry(cache, &info->block_groups, list) {
3026 cache = list_entry(l, struct btrfs_block_group_cache, list);
3027 spin_lock(&cache->lock); 3347 spin_lock(&cache->lock);
3028 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 3348 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3029 "%llu pinned %llu reserved\n", 3349 "%llu pinned %llu reserved\n",
@@ -3047,24 +3367,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3047{ 3367{
3048 int ret; 3368 int ret;
3049 u64 search_start = 0; 3369 u64 search_start = 0;
3050 u64 alloc_profile;
3051 struct btrfs_fs_info *info = root->fs_info; 3370 struct btrfs_fs_info *info = root->fs_info;
3052 3371
3053 if (data) { 3372 data = btrfs_get_alloc_profile(root, data);
3054 alloc_profile = info->avail_data_alloc_bits &
3055 info->data_alloc_profile;
3056 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
3057 } else if (root == root->fs_info->chunk_root) {
3058 alloc_profile = info->avail_system_alloc_bits &
3059 info->system_alloc_profile;
3060 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
3061 } else {
3062 alloc_profile = info->avail_metadata_alloc_bits &
3063 info->metadata_alloc_profile;
3064 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
3065 }
3066again: 3373again:
3067 data = btrfs_reduce_alloc_profile(root, data);
3068 /* 3374 /*
3069 * the only place that sets empty_size is btrfs_realloc_node, which 3375 * the only place that sets empty_size is btrfs_realloc_node, which
3070 * is not called recursively on allocations 3376 * is not called recursively on allocations
@@ -3332,7 +3638,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3332 3638
3333struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 3639struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3334 struct btrfs_root *root, 3640 struct btrfs_root *root,
3335 u64 bytenr, u32 blocksize) 3641 u64 bytenr, u32 blocksize,
3642 int level)
3336{ 3643{
3337 struct extent_buffer *buf; 3644 struct extent_buffer *buf;
3338 3645
@@ -3340,9 +3647,13 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3340 if (!buf) 3647 if (!buf)
3341 return ERR_PTR(-ENOMEM); 3648 return ERR_PTR(-ENOMEM);
3342 btrfs_set_header_generation(buf, trans->transid); 3649 btrfs_set_header_generation(buf, trans->transid);
3650 btrfs_set_buffer_lockdep_class(buf, level);
3343 btrfs_tree_lock(buf); 3651 btrfs_tree_lock(buf);
3344 clean_tree_block(trans, root, buf); 3652 clean_tree_block(trans, root, buf);
3653
3654 btrfs_set_lock_blocking(buf);
3345 btrfs_set_buffer_uptodate(buf); 3655 btrfs_set_buffer_uptodate(buf);
3656
3346 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3657 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3347 set_extent_dirty(&root->dirty_log_pages, buf->start, 3658 set_extent_dirty(&root->dirty_log_pages, buf->start,
3348 buf->start + buf->len - 1, GFP_NOFS); 3659 buf->start + buf->len - 1, GFP_NOFS);
@@ -3351,6 +3662,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3351 buf->start + buf->len - 1, GFP_NOFS); 3662 buf->start + buf->len - 1, GFP_NOFS);
3352 } 3663 }
3353 trans->blocks_used++; 3664 trans->blocks_used++;
3665 /* this returns a buffer locked for blocking */
3354 return buf; 3666 return buf;
3355} 3667}
3356 3668
@@ -3379,7 +3691,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3379 return ERR_PTR(ret); 3691 return ERR_PTR(ret);
3380 } 3692 }
3381 3693
3382 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); 3694 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
3695 blocksize, level);
3383 return buf; 3696 return buf;
3384} 3697}
3385 3698
@@ -3388,36 +3701,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3388{ 3701{
3389 u64 leaf_owner; 3702 u64 leaf_owner;
3390 u64 leaf_generation; 3703 u64 leaf_generation;
3704 struct refsort *sorted;
3391 struct btrfs_key key; 3705 struct btrfs_key key;
3392 struct btrfs_file_extent_item *fi; 3706 struct btrfs_file_extent_item *fi;
3393 int i; 3707 int i;
3394 int nritems; 3708 int nritems;
3395 int ret; 3709 int ret;
3710 int refi = 0;
3711 int slot;
3396 3712
3397 BUG_ON(!btrfs_is_leaf(leaf)); 3713 BUG_ON(!btrfs_is_leaf(leaf));
3398 nritems = btrfs_header_nritems(leaf); 3714 nritems = btrfs_header_nritems(leaf);
3399 leaf_owner = btrfs_header_owner(leaf); 3715 leaf_owner = btrfs_header_owner(leaf);
3400 leaf_generation = btrfs_header_generation(leaf); 3716 leaf_generation = btrfs_header_generation(leaf);
3401 3717
3718 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3719 /* we do this loop twice. The first time we build a list
3720 * of the extents we have a reference on, then we sort the list
3721 * by bytenr. The second time around we actually do the
3722 * extent freeing.
3723 */
3402 for (i = 0; i < nritems; i++) { 3724 for (i = 0; i < nritems; i++) {
3403 u64 disk_bytenr; 3725 u64 disk_bytenr;
3404 cond_resched(); 3726 cond_resched();
3405 3727
3406 btrfs_item_key_to_cpu(leaf, &key, i); 3728 btrfs_item_key_to_cpu(leaf, &key, i);
3729
3730 /* only extents have references, skip everything else */
3407 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3731 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3408 continue; 3732 continue;
3733
3409 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3734 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3735
3736 /* inline extents live in the btree, they don't have refs */
3410 if (btrfs_file_extent_type(leaf, fi) == 3737 if (btrfs_file_extent_type(leaf, fi) ==
3411 BTRFS_FILE_EXTENT_INLINE) 3738 BTRFS_FILE_EXTENT_INLINE)
3412 continue; 3739 continue;
3413 /* 3740
3414 * FIXME make sure to insert a trans record that
3415 * repeats the snapshot del on crash
3416 */
3417 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 3741 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3742
3743 /* holes don't have refs */
3418 if (disk_bytenr == 0) 3744 if (disk_bytenr == 0)
3419 continue; 3745 continue;
3420 3746
3747 sorted[refi].bytenr = disk_bytenr;
3748 sorted[refi].slot = i;
3749 refi++;
3750 }
3751
3752 if (refi == 0)
3753 goto out;
3754
3755 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3756
3757 for (i = 0; i < refi; i++) {
3758 u64 disk_bytenr;
3759
3760 disk_bytenr = sorted[i].bytenr;
3761 slot = sorted[i].slot;
3762
3763 cond_resched();
3764
3765 btrfs_item_key_to_cpu(leaf, &key, slot);
3766 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3767 continue;
3768
3769 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3770
3421 ret = __btrfs_free_extent(trans, root, disk_bytenr, 3771 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3422 btrfs_file_extent_disk_num_bytes(leaf, fi), 3772 btrfs_file_extent_disk_num_bytes(leaf, fi),
3423 leaf->start, leaf_owner, leaf_generation, 3773 leaf->start, leaf_owner, leaf_generation,
@@ -3428,6 +3778,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3428 wake_up(&root->fs_info->transaction_throttle); 3778 wake_up(&root->fs_info->transaction_throttle);
3429 cond_resched(); 3779 cond_resched();
3430 } 3780 }
3781out:
3782 kfree(sorted);
3431 return 0; 3783 return 0;
3432} 3784}
3433 3785
@@ -3437,9 +3789,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3437{ 3789{
3438 int i; 3790 int i;
3439 int ret; 3791 int ret;
3440 struct btrfs_extent_info *info = ref->extents; 3792 struct btrfs_extent_info *info;
3793 struct refsort *sorted;
3794
3795 if (ref->nritems == 0)
3796 return 0;
3441 3797
3798 sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
3442 for (i = 0; i < ref->nritems; i++) { 3799 for (i = 0; i < ref->nritems; i++) {
3800 sorted[i].bytenr = ref->extents[i].bytenr;
3801 sorted[i].slot = i;
3802 }
3803 sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3804
3805 /*
3806 * the items in the ref were sorted when the ref was inserted
3807 * into the ref cache, so this is already in order
3808 */
3809 for (i = 0; i < ref->nritems; i++) {
3810 info = ref->extents + sorted[i].slot;
3443 ret = __btrfs_free_extent(trans, root, info->bytenr, 3811 ret = __btrfs_free_extent(trans, root, info->bytenr,
3444 info->num_bytes, ref->bytenr, 3812 info->num_bytes, ref->bytenr,
3445 ref->owner, ref->generation, 3813 ref->owner, ref->generation,
@@ -3453,6 +3821,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3453 info++; 3821 info++;
3454 } 3822 }
3455 3823
3824 kfree(sorted);
3456 return 0; 3825 return 0;
3457} 3826}
3458 3827
@@ -3497,6 +3866,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3497} 3866}
3498 3867
3499/* 3868/*
3869 * this is used while deleting old snapshots, and it drops the refs
3870 * on a whole subtree starting from a level 1 node.
3871 *
3872 * The idea is to sort all the leaf pointers, and then drop the
3873 * ref on all the leaves in order. Most of the time the leaves
3874 * will have ref cache entries, so no leaf IOs will be required to
3875 * find the extents they have references on.
3876 *
3877 * For each leaf, any references it has are also dropped in order
3878 *
3879 * This ends up dropping the references in something close to optimal
3880 * order for reading and modifying the extent allocation tree.
3881 */
3882static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3883 struct btrfs_root *root,
3884 struct btrfs_path *path)
3885{
3886 u64 bytenr;
3887 u64 root_owner;
3888 u64 root_gen;
3889 struct extent_buffer *eb = path->nodes[1];
3890 struct extent_buffer *leaf;
3891 struct btrfs_leaf_ref *ref;
3892 struct refsort *sorted = NULL;
3893 int nritems = btrfs_header_nritems(eb);
3894 int ret;
3895 int i;
3896 int refi = 0;
3897 int slot = path->slots[1];
3898 u32 blocksize = btrfs_level_size(root, 0);
3899 u32 refs;
3900
3901 if (nritems == 0)
3902 goto out;
3903
3904 root_owner = btrfs_header_owner(eb);
3905 root_gen = btrfs_header_generation(eb);
3906 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3907
3908 /*
3909 * step one, sort all the leaf pointers so we don't scribble
3910 * randomly into the extent allocation tree
3911 */
3912 for (i = slot; i < nritems; i++) {
3913 sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3914 sorted[refi].slot = i;
3915 refi++;
3916 }
3917
3918 /*
3919 * nritems won't be zero, but if we're picking up drop_snapshot
3920 * after a crash, slot might be > 0, so double check things
3921 * just in case.
3922 */
3923 if (refi == 0)
3924 goto out;
3925
3926 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3927
3928 /*
3929 * the first loop frees everything the leaves point to
3930 */
3931 for (i = 0; i < refi; i++) {
3932 u64 ptr_gen;
3933
3934 bytenr = sorted[i].bytenr;
3935
3936 /*
3937 * check the reference count on this leaf. If it is > 1
3938 * we just decrement it below and don't update any
3939 * of the refs the leaf points to.
3940 */
3941 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3942 BUG_ON(ret);
3943 if (refs != 1)
3944 continue;
3945
3946 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3947
3948 /*
3949 * the leaf only had one reference, which means the
3950 * only thing pointing to this leaf is the snapshot
3951 * we're deleting. It isn't possible for the reference
3952 * count to increase again later
3953 *
3954 * The reference cache is checked for the leaf,
3955 * and if found we'll be able to drop any refs held by
3956 * the leaf without needing to read it in.
3957 */
3958 ref = btrfs_lookup_leaf_ref(root, bytenr);
3959 if (ref && ref->generation != ptr_gen) {
3960 btrfs_free_leaf_ref(root, ref);
3961 ref = NULL;
3962 }
3963 if (ref) {
3964 ret = cache_drop_leaf_ref(trans, root, ref);
3965 BUG_ON(ret);
3966 btrfs_remove_leaf_ref(root, ref);
3967 btrfs_free_leaf_ref(root, ref);
3968 } else {
3969 /*
3970 * the leaf wasn't in the reference cache, so
3971 * we have to read it.
3972 */
3973 leaf = read_tree_block(root, bytenr, blocksize,
3974 ptr_gen);
3975 ret = btrfs_drop_leaf_ref(trans, root, leaf);
3976 BUG_ON(ret);
3977 free_extent_buffer(leaf);
3978 }
3979 atomic_inc(&root->fs_info->throttle_gen);
3980 wake_up(&root->fs_info->transaction_throttle);
3981 cond_resched();
3982 }
3983
3984 /*
3985 * run through the loop again to free the refs on the leaves.
3986 * This is faster than doing it in the loop above because
3987 * the leaves are likely to be clustered together. We end up
3988 * working in nice chunks on the extent allocation tree.
3989 */
3990 for (i = 0; i < refi; i++) {
3991 bytenr = sorted[i].bytenr;
3992 ret = __btrfs_free_extent(trans, root, bytenr,
3993 blocksize, eb->start,
3994 root_owner, root_gen, 0, 1);
3995 BUG_ON(ret);
3996
3997 atomic_inc(&root->fs_info->throttle_gen);
3998 wake_up(&root->fs_info->transaction_throttle);
3999 cond_resched();
4000 }
4001out:
4002 kfree(sorted);
4003
4004 /*
4005 * update the path to show we've processed the entire level 1
4006 * node. This will get saved into the root's drop_snapshot_progress
4007 * field so these drops are not repeated again if this transaction
4008 * commits.
4009 */
4010 path->slots[1] = nritems;
4011 return 0;
4012}
4013
4014/*
3500 * helper function for drop_snapshot, this walks down the tree dropping ref 4015 * helper function for drop_snapshot, this walks down the tree dropping ref
3501 * counts as it goes. 4016 * counts as it goes.
3502 */ 4017 */
@@ -3511,7 +4026,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3511 struct extent_buffer *next; 4026 struct extent_buffer *next;
3512 struct extent_buffer *cur; 4027 struct extent_buffer *cur;
3513 struct extent_buffer *parent; 4028 struct extent_buffer *parent;
3514 struct btrfs_leaf_ref *ref;
3515 u32 blocksize; 4029 u32 blocksize;
3516 int ret; 4030 int ret;
3517 u32 refs; 4031 u32 refs;
@@ -3538,17 +4052,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3538 if (path->slots[*level] >= 4052 if (path->slots[*level] >=
3539 btrfs_header_nritems(cur)) 4053 btrfs_header_nritems(cur))
3540 break; 4054 break;
4055
4056 /* the new code goes down to level 1 and does all the
4057 * leaves pointed to that node in bulk. So, this check
4058 * for level 0 will always be false.
4059 *
4060 * But, the disk format allows the drop_snapshot_progress
4061 * field in the root to leave things in a state where
4062 * a leaf will need cleaning up here. If someone crashes
4063 * with the old code and then boots with the new code,
4064 * we might find a leaf here.
4065 */
3541 if (*level == 0) { 4066 if (*level == 0) {
3542 ret = btrfs_drop_leaf_ref(trans, root, cur); 4067 ret = btrfs_drop_leaf_ref(trans, root, cur);
3543 BUG_ON(ret); 4068 BUG_ON(ret);
3544 break; 4069 break;
3545 } 4070 }
4071
4072 /*
4073 * once we get to level one, process the whole node
4074 * at once, including everything below it.
4075 */
4076 if (*level == 1) {
4077 ret = drop_level_one_refs(trans, root, path);
4078 BUG_ON(ret);
4079 break;
4080 }
4081
3546 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 4082 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3547 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 4083 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3548 blocksize = btrfs_level_size(root, *level - 1); 4084 blocksize = btrfs_level_size(root, *level - 1);
3549 4085
3550 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 4086 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3551 BUG_ON(ret); 4087 BUG_ON(ret);
4088
4089 /*
4090 * if there is more than one reference, we don't need
4091 * to read that node to drop any references it has. We
4092 * just drop the ref we hold on that node and move on to the
4093 * next slot in this level.
4094 */
3552 if (refs != 1) { 4095 if (refs != 1) {
3553 parent = path->nodes[*level]; 4096 parent = path->nodes[*level];
3554 root_owner = btrfs_header_owner(parent); 4097 root_owner = btrfs_header_owner(parent);
@@ -3567,46 +4110,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3567 4110
3568 continue; 4111 continue;
3569 } 4112 }
4113
3570 /* 4114 /*
3571 * at this point, we have a single ref, and since the 4115 * we need to keep freeing things in the next level down.
3572 * only place referencing this extent is a dead root 4116 * read the block and loop around to process it
3573 * the reference count should never go higher.
3574 * So, we don't need to check it again
3575 */ 4117 */
3576 if (*level == 1) { 4118 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3577 ref = btrfs_lookup_leaf_ref(root, bytenr);
3578 if (ref && ref->generation != ptr_gen) {
3579 btrfs_free_leaf_ref(root, ref);
3580 ref = NULL;
3581 }
3582 if (ref) {
3583 ret = cache_drop_leaf_ref(trans, root, ref);
3584 BUG_ON(ret);
3585 btrfs_remove_leaf_ref(root, ref);
3586 btrfs_free_leaf_ref(root, ref);
3587 *level = 0;
3588 break;
3589 }
3590 }
3591 next = btrfs_find_tree_block(root, bytenr, blocksize);
3592 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3593 free_extent_buffer(next);
3594
3595 next = read_tree_block(root, bytenr, blocksize,
3596 ptr_gen);
3597 cond_resched();
3598#if 0
3599 /*
3600 * this is a debugging check and can go away
3601 * the ref should never go all the way down to 1
3602 * at this point
3603 */
3604 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3605 &refs);
3606 BUG_ON(ret);
3607 WARN_ON(refs != 1);
3608#endif
3609 }
3610 WARN_ON(*level <= 0); 4119 WARN_ON(*level <= 0);
3611 if (path->nodes[*level-1]) 4120 if (path->nodes[*level-1])
3612 free_extent_buffer(path->nodes[*level-1]); 4121 free_extent_buffer(path->nodes[*level-1]);
@@ -3631,11 +4140,16 @@ out:
3631 root_owner = btrfs_header_owner(parent); 4140 root_owner = btrfs_header_owner(parent);
3632 root_gen = btrfs_header_generation(parent); 4141 root_gen = btrfs_header_generation(parent);
3633 4142
4143 /*
4144 * cleanup and free the reference on the last node
4145 * we processed
4146 */
3634 ret = __btrfs_free_extent(trans, root, bytenr, blocksize, 4147 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3635 parent->start, root_owner, root_gen, 4148 parent->start, root_owner, root_gen,
3636 *level, 1); 4149 *level, 1);
3637 free_extent_buffer(path->nodes[*level]); 4150 free_extent_buffer(path->nodes[*level]);
3638 path->nodes[*level] = NULL; 4151 path->nodes[*level] = NULL;
4152
3639 *level += 1; 4153 *level += 1;
3640 BUG_ON(ret); 4154 BUG_ON(ret);
3641 4155
@@ -3687,6 +4201,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3687 4201
3688 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 4202 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3689 btrfs_tree_lock(next); 4203 btrfs_tree_lock(next);
4204 btrfs_set_lock_blocking(next);
3690 4205
3691 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, 4206 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3692 &refs); 4207 &refs);
@@ -3754,6 +4269,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3754 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 4269 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3755 struct extent_buffer *node; 4270 struct extent_buffer *node;
3756 struct btrfs_disk_key disk_key; 4271 struct btrfs_disk_key disk_key;
4272
4273 /*
4274 * there is more work to do in this level.
4275 * Update the drop_progress marker to reflect
4276 * the work we've done so far, and then bump
4277 * the slot number
4278 */
3757 node = path->nodes[i]; 4279 node = path->nodes[i];
3758 path->slots[i]++; 4280 path->slots[i]++;
3759 *level = i; 4281 *level = i;
@@ -3765,6 +4287,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3765 return 0; 4287 return 0;
3766 } else { 4288 } else {
3767 struct extent_buffer *parent; 4289 struct extent_buffer *parent;
4290
4291 /*
4292 * this whole node is done, free our reference
4293 * on it and go up one level
4294 */
3768 if (path->nodes[*level] == root->node) 4295 if (path->nodes[*level] == root->node)
3769 parent = path->nodes[*level]; 4296 parent = path->nodes[*level];
3770 else 4297 else
@@ -4444,7 +4971,7 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4444 u64 lock_end = 0; 4971 u64 lock_end = 0;
4445 u64 num_bytes; 4972 u64 num_bytes;
4446 u64 ext_offset; 4973 u64 ext_offset;
4447 u64 first_pos; 4974 u64 search_end = (u64)-1;
4448 u32 nritems; 4975 u32 nritems;
4449 int nr_scaned = 0; 4976 int nr_scaned = 0;
4450 int extent_locked = 0; 4977 int extent_locked = 0;
@@ -4452,7 +4979,6 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4452 int ret; 4979 int ret;
4453 4980
4454 memcpy(&key, leaf_key, sizeof(key)); 4981 memcpy(&key, leaf_key, sizeof(key));
4455 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4456 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 4982 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4457 if (key.objectid < ref_path->owner_objectid || 4983 if (key.objectid < ref_path->owner_objectid ||
4458 (key.objectid == ref_path->owner_objectid && 4984 (key.objectid == ref_path->owner_objectid &&
@@ -4501,7 +5027,7 @@ next:
4501 if ((key.objectid > ref_path->owner_objectid) || 5027 if ((key.objectid > ref_path->owner_objectid) ||
4502 (key.objectid == ref_path->owner_objectid && 5028 (key.objectid == ref_path->owner_objectid &&
4503 key.type > BTRFS_EXTENT_DATA_KEY) || 5029 key.type > BTRFS_EXTENT_DATA_KEY) ||
4504 (key.offset >= first_pos + extent_key->offset)) 5030 key.offset >= search_end)
4505 break; 5031 break;
4506 } 5032 }
4507 5033
@@ -4534,8 +5060,10 @@ next:
4534 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 5060 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4535 ext_offset = btrfs_file_extent_offset(leaf, fi); 5061 ext_offset = btrfs_file_extent_offset(leaf, fi);
4536 5062
4537 if (first_pos > key.offset - ext_offset) 5063 if (search_end == (u64)-1) {
4538 first_pos = key.offset - ext_offset; 5064 search_end = key.offset - ext_offset +
5065 btrfs_file_extent_ram_bytes(leaf, fi);
5066 }
4539 5067
4540 if (!extent_locked) { 5068 if (!extent_locked) {
4541 lock_start = key.offset; 5069 lock_start = key.offset;
@@ -4724,7 +5252,7 @@ next:
4724 } 5252 }
4725skip: 5253skip:
4726 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && 5254 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4727 key.offset >= first_pos + extent_key->offset) 5255 key.offset >= search_end)
4728 break; 5256 break;
4729 5257
4730 cond_resched(); 5258 cond_resched();
@@ -4778,6 +5306,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4778 ref->bytenr = buf->start; 5306 ref->bytenr = buf->start;
4779 ref->owner = btrfs_header_owner(buf); 5307 ref->owner = btrfs_header_owner(buf);
4780 ref->generation = btrfs_header_generation(buf); 5308 ref->generation = btrfs_header_generation(buf);
5309
4781 ret = btrfs_add_leaf_ref(root, ref, 0); 5310 ret = btrfs_add_leaf_ref(root, ref, 0);
4782 WARN_ON(ret); 5311 WARN_ON(ret);
4783 btrfs_free_leaf_ref(root, ref); 5312 btrfs_free_leaf_ref(root, ref);
@@ -5351,7 +5880,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5351 prev_block = block_start; 5880 prev_block = block_start;
5352 } 5881 }
5353 5882
5883 mutex_lock(&extent_root->fs_info->trans_mutex);
5354 btrfs_record_root_in_trans(found_root); 5884 btrfs_record_root_in_trans(found_root);
5885 mutex_unlock(&extent_root->fs_info->trans_mutex);
5355 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 5886 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5356 /* 5887 /*
5357 * try to update data extent references while 5888 * try to update data extent references while
@@ -5957,9 +6488,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5957 path = btrfs_alloc_path(); 6488 path = btrfs_alloc_path();
5958 BUG_ON(!path); 6489 BUG_ON(!path);
5959 6490
5960 btrfs_remove_free_space_cache(block_group); 6491 spin_lock(&root->fs_info->block_group_cache_lock);
5961 rb_erase(&block_group->cache_node, 6492 rb_erase(&block_group->cache_node,
5962 &root->fs_info->block_group_cache_tree); 6493 &root->fs_info->block_group_cache_tree);
6494 spin_unlock(&root->fs_info->block_group_cache_lock);
6495 btrfs_remove_free_space_cache(block_group);
5963 down_write(&block_group->space_info->groups_sem); 6496 down_write(&block_group->space_info->groups_sem);
5964 list_del(&block_group->list); 6497 list_del(&block_group->list);
5965 up_write(&block_group->space_info->groups_sem); 6498 up_write(&block_group->space_info->groups_sem);