aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/extent-tree.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--fs/btrfs/extent-tree.c818
1 files changed, 695 insertions, 123 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 293da650873..fefe83ad205 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,7 +19,8 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/version.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h>
23#include "compat.h" 24#include "compat.h"
24#include "hash.h" 25#include "hash.h"
25#include "crc32c.h" 26#include "crc32c.h"
@@ -30,7 +31,6 @@
30#include "volumes.h" 31#include "volumes.h"
31#include "locking.h" 32#include "locking.h"
32#include "ref-cache.h" 33#include "ref-cache.h"
33#include "compat.h"
34 34
35#define PENDING_EXTENT_INSERT 0 35#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1 36#define PENDING_EXTENT_DELETE 1
@@ -61,6 +61,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
61 u64 bytenr, u64 num_bytes, int alloc, 61 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free); 62 int mark_free);
63 63
64static int do_chunk_alloc(struct btrfs_trans_handle *trans,
65 struct btrfs_root *extent_root, u64 alloc_bytes,
66 u64 flags, int force);
67
64static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 68static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65{ 69{
66 return (cache->flags & bits) == bits; 70 return (cache->flags & bits) == bits;
@@ -326,16 +330,34 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
326 u64 flags) 330 u64 flags)
327{ 331{
328 struct list_head *head = &info->space_info; 332 struct list_head *head = &info->space_info;
329 struct list_head *cur;
330 struct btrfs_space_info *found; 333 struct btrfs_space_info *found;
331 list_for_each(cur, head) { 334
332 found = list_entry(cur, struct btrfs_space_info, list); 335 rcu_read_lock();
333 if (found->flags == flags) 336 list_for_each_entry_rcu(found, head, list) {
337 if (found->flags == flags) {
338 rcu_read_unlock();
334 return found; 339 return found;
340 }
335 } 341 }
342 rcu_read_unlock();
336 return NULL; 343 return NULL;
337} 344}
338 345
346/*
347 * after adding space to the filesystem, we need to clear the full flags
348 * on all the space infos.
349 */
350void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
351{
352 struct list_head *head = &info->space_info;
353 struct btrfs_space_info *found;
354
355 rcu_read_lock();
356 list_for_each_entry_rcu(found, head, list)
357 found->full = 0;
358 rcu_read_unlock();
359}
360
339static u64 div_factor(u64 num, int factor) 361static u64 div_factor(u64 num, int factor)
340{ 362{
341 if (factor == 10) 363 if (factor == 10)
@@ -1326,8 +1348,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1326int btrfs_extent_post_op(struct btrfs_trans_handle *trans, 1348int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root) 1349 struct btrfs_root *root)
1328{ 1350{
1329 finish_current_insert(trans, root->fs_info->extent_root, 1); 1351 u64 start;
1330 del_pending_extents(trans, root->fs_info->extent_root, 1); 1352 u64 end;
1353 int ret;
1354
1355 while(1) {
1356 finish_current_insert(trans, root->fs_info->extent_root, 1);
1357 del_pending_extents(trans, root->fs_info->extent_root, 1);
1358
1359 /* is there more work to do? */
1360 ret = find_first_extent_bit(&root->fs_info->pending_del,
1361 0, &start, &end, EXTENT_WRITEBACK);
1362 if (!ret)
1363 continue;
1364 ret = find_first_extent_bit(&root->fs_info->extent_ins,
1365 0, &start, &end, EXTENT_WRITEBACK);
1366 if (!ret)
1367 continue;
1368 break;
1369 }
1331 return 0; 1370 return 0;
1332} 1371}
1333 1372
@@ -1525,15 +1564,55 @@ out:
1525 return ret; 1564 return ret;
1526} 1565}
1527 1566
1528int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1567/* when a block goes through cow, we update the reference counts of
1529 struct extent_buffer *orig_buf, struct extent_buffer *buf, 1568 * everything that block points to. The internal pointers of the block
1530 u32 *nr_extents) 1569 * can be in just about any order, and it is likely to have clusters of
1570 * things that are close together and clusters of things that are not.
1571 *
1572 * To help reduce the seeks that come with updating all of these reference
1573 * counts, sort them by byte number before actual updates are done.
1574 *
1575 * struct refsort is used to match byte number to slot in the btree block.
1576 * we sort based on the byte number and then use the slot to actually
1577 * find the item.
1578 *
1579 * struct refsort is smaller than strcut btrfs_item and smaller than
1580 * struct btrfs_key_ptr. Since we're currently limited to the page size
1581 * for a btree block, there's no way for a kmalloc of refsorts for a
1582 * single node to be bigger than a page.
1583 */
1584struct refsort {
1585 u64 bytenr;
1586 u32 slot;
1587};
1588
1589/*
1590 * for passing into sort()
1591 */
1592static int refsort_cmp(const void *a_void, const void *b_void)
1593{
1594 const struct refsort *a = a_void;
1595 const struct refsort *b = b_void;
1596
1597 if (a->bytenr < b->bytenr)
1598 return -1;
1599 if (a->bytenr > b->bytenr)
1600 return 1;
1601 return 0;
1602}
1603
1604
1605noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1606 struct btrfs_root *root,
1607 struct extent_buffer *orig_buf,
1608 struct extent_buffer *buf, u32 *nr_extents)
1531{ 1609{
1532 u64 bytenr; 1610 u64 bytenr;
1533 u64 ref_root; 1611 u64 ref_root;
1534 u64 orig_root; 1612 u64 orig_root;
1535 u64 ref_generation; 1613 u64 ref_generation;
1536 u64 orig_generation; 1614 u64 orig_generation;
1615 struct refsort *sorted;
1537 u32 nritems; 1616 u32 nritems;
1538 u32 nr_file_extents = 0; 1617 u32 nr_file_extents = 0;
1539 struct btrfs_key key; 1618 struct btrfs_key key;
@@ -1542,6 +1621,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1542 int level; 1621 int level;
1543 int ret = 0; 1622 int ret = 0;
1544 int faili = 0; 1623 int faili = 0;
1624 int refi = 0;
1625 int slot;
1545 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 1626 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1546 u64, u64, u64, u64, u64, u64, u64, u64); 1627 u64, u64, u64, u64, u64, u64, u64, u64);
1547 1628
@@ -1553,6 +1634,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1553 nritems = btrfs_header_nritems(buf); 1634 nritems = btrfs_header_nritems(buf);
1554 level = btrfs_header_level(buf); 1635 level = btrfs_header_level(buf);
1555 1636
1637 sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
1638 BUG_ON(!sorted);
1639
1556 if (root->ref_cows) { 1640 if (root->ref_cows) {
1557 process_func = __btrfs_inc_extent_ref; 1641 process_func = __btrfs_inc_extent_ref;
1558 } else { 1642 } else {
@@ -1565,6 +1649,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1565 process_func = __btrfs_update_extent_ref; 1649 process_func = __btrfs_update_extent_ref;
1566 } 1650 }
1567 1651
1652 /*
1653 * we make two passes through the items. In the first pass we
1654 * only record the byte number and slot. Then we sort based on
1655 * byte number and do the actual work based on the sorted results
1656 */
1568 for (i = 0; i < nritems; i++) { 1657 for (i = 0; i < nritems; i++) {
1569 cond_resched(); 1658 cond_resched();
1570 if (level == 0) { 1659 if (level == 0) {
@@ -1581,6 +1670,32 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1581 continue; 1670 continue;
1582 1671
1583 nr_file_extents++; 1672 nr_file_extents++;
1673 sorted[refi].bytenr = bytenr;
1674 sorted[refi].slot = i;
1675 refi++;
1676 } else {
1677 bytenr = btrfs_node_blockptr(buf, i);
1678 sorted[refi].bytenr = bytenr;
1679 sorted[refi].slot = i;
1680 refi++;
1681 }
1682 }
1683 /*
1684 * if refi == 0, we didn't actually put anything into the sorted
1685 * array and we're done
1686 */
1687 if (refi == 0)
1688 goto out;
1689
1690 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1691
1692 for (i = 0; i < refi; i++) {
1693 cond_resched();
1694 slot = sorted[i].slot;
1695 bytenr = sorted[i].bytenr;
1696
1697 if (level == 0) {
1698 btrfs_item_key_to_cpu(buf, &key, slot);
1584 1699
1585 ret = process_func(trans, root, bytenr, 1700 ret = process_func(trans, root, bytenr,
1586 orig_buf->start, buf->start, 1701 orig_buf->start, buf->start,
@@ -1589,25 +1704,25 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1589 key.objectid); 1704 key.objectid);
1590 1705
1591 if (ret) { 1706 if (ret) {
1592 faili = i; 1707 faili = slot;
1593 WARN_ON(1); 1708 WARN_ON(1);
1594 goto fail; 1709 goto fail;
1595 } 1710 }
1596 } else { 1711 } else {
1597 bytenr = btrfs_node_blockptr(buf, i);
1598 ret = process_func(trans, root, bytenr, 1712 ret = process_func(trans, root, bytenr,
1599 orig_buf->start, buf->start, 1713 orig_buf->start, buf->start,
1600 orig_root, ref_root, 1714 orig_root, ref_root,
1601 orig_generation, ref_generation, 1715 orig_generation, ref_generation,
1602 level - 1); 1716 level - 1);
1603 if (ret) { 1717 if (ret) {
1604 faili = i; 1718 faili = slot;
1605 WARN_ON(1); 1719 WARN_ON(1);
1606 goto fail; 1720 goto fail;
1607 } 1721 }
1608 } 1722 }
1609 } 1723 }
1610out: 1724out:
1725 kfree(sorted);
1611 if (nr_extents) { 1726 if (nr_extents) {
1612 if (level == 0) 1727 if (level == 0)
1613 *nr_extents = nr_file_extents; 1728 *nr_extents = nr_file_extents;
@@ -1616,6 +1731,7 @@ out:
1616 } 1731 }
1617 return 0; 1732 return 0;
1618fail: 1733fail:
1734 kfree(sorted);
1619 WARN_ON(1); 1735 WARN_ON(1);
1620 return ret; 1736 return ret;
1621} 1737}
@@ -1808,7 +1924,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1808 if (!found) 1924 if (!found)
1809 return -ENOMEM; 1925 return -ENOMEM;
1810 1926
1811 list_add(&found->list, &info->space_info);
1812 INIT_LIST_HEAD(&found->block_groups); 1927 INIT_LIST_HEAD(&found->block_groups);
1813 init_rwsem(&found->groups_sem); 1928 init_rwsem(&found->groups_sem);
1814 spin_lock_init(&found->lock); 1929 spin_lock_init(&found->lock);
@@ -1818,9 +1933,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1818 found->bytes_pinned = 0; 1933 found->bytes_pinned = 0;
1819 found->bytes_reserved = 0; 1934 found->bytes_reserved = 0;
1820 found->bytes_readonly = 0; 1935 found->bytes_readonly = 0;
1936 found->bytes_delalloc = 0;
1821 found->full = 0; 1937 found->full = 0;
1822 found->force_alloc = 0; 1938 found->force_alloc = 0;
1823 *space_info = found; 1939 *space_info = found;
1940 list_add_rcu(&found->list, &info->space_info);
1824 return 0; 1941 return 0;
1825} 1942}
1826 1943
@@ -1881,6 +1998,233 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1881 return flags; 1998 return flags;
1882} 1999}
1883 2000
2001static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
2002{
2003 struct btrfs_fs_info *info = root->fs_info;
2004 u64 alloc_profile;
2005
2006 if (data) {
2007 alloc_profile = info->avail_data_alloc_bits &
2008 info->data_alloc_profile;
2009 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2010 } else if (root == root->fs_info->chunk_root) {
2011 alloc_profile = info->avail_system_alloc_bits &
2012 info->system_alloc_profile;
2013 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2014 } else {
2015 alloc_profile = info->avail_metadata_alloc_bits &
2016 info->metadata_alloc_profile;
2017 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2018 }
2019
2020 return btrfs_reduce_alloc_profile(root, data);
2021}
2022
2023void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2024{
2025 u64 alloc_target;
2026
2027 alloc_target = btrfs_get_alloc_profile(root, 1);
2028 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2029 alloc_target);
2030}
2031
2032/*
2033 * for now this just makes sure we have at least 5% of our metadata space free
2034 * for use.
2035 */
2036int btrfs_check_metadata_free_space(struct btrfs_root *root)
2037{
2038 struct btrfs_fs_info *info = root->fs_info;
2039 struct btrfs_space_info *meta_sinfo;
2040 u64 alloc_target, thresh;
2041 int committed = 0, ret;
2042
2043 /* get the space info for where the metadata will live */
2044 alloc_target = btrfs_get_alloc_profile(root, 0);
2045 meta_sinfo = __find_space_info(info, alloc_target);
2046
2047again:
2048 spin_lock(&meta_sinfo->lock);
2049 if (!meta_sinfo->full)
2050 thresh = meta_sinfo->total_bytes * 80;
2051 else
2052 thresh = meta_sinfo->total_bytes * 95;
2053
2054 do_div(thresh, 100);
2055
2056 if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2057 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
2058 struct btrfs_trans_handle *trans;
2059 if (!meta_sinfo->full) {
2060 meta_sinfo->force_alloc = 1;
2061 spin_unlock(&meta_sinfo->lock);
2062
2063 trans = btrfs_start_transaction(root, 1);
2064 if (!trans)
2065 return -ENOMEM;
2066
2067 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2068 2 * 1024 * 1024, alloc_target, 0);
2069 btrfs_end_transaction(trans, root);
2070 goto again;
2071 }
2072 spin_unlock(&meta_sinfo->lock);
2073
2074 if (!committed) {
2075 committed = 1;
2076 trans = btrfs_join_transaction(root, 1);
2077 if (!trans)
2078 return -ENOMEM;
2079 ret = btrfs_commit_transaction(trans, root);
2080 if (ret)
2081 return ret;
2082 goto again;
2083 }
2084 return -ENOSPC;
2085 }
2086 spin_unlock(&meta_sinfo->lock);
2087
2088 return 0;
2089}
2090
2091/*
2092 * This will check the space that the inode allocates from to make sure we have
2093 * enough space for bytes.
2094 */
2095int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2096 u64 bytes)
2097{
2098 struct btrfs_space_info *data_sinfo;
2099 int ret = 0, committed = 0;
2100
2101 /* make sure bytes are sectorsize aligned */
2102 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2103
2104 data_sinfo = BTRFS_I(inode)->space_info;
2105again:
2106 /* make sure we have enough space to handle the data first */
2107 spin_lock(&data_sinfo->lock);
2108 if (data_sinfo->total_bytes - data_sinfo->bytes_used -
2109 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
2110 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
2111 data_sinfo->bytes_may_use < bytes) {
2112 struct btrfs_trans_handle *trans;
2113
2114 /*
2115 * if we don't have enough free bytes in this space then we need
2116 * to alloc a new chunk.
2117 */
2118 if (!data_sinfo->full) {
2119 u64 alloc_target;
2120
2121 data_sinfo->force_alloc = 1;
2122 spin_unlock(&data_sinfo->lock);
2123
2124 alloc_target = btrfs_get_alloc_profile(root, 1);
2125 trans = btrfs_start_transaction(root, 1);
2126 if (!trans)
2127 return -ENOMEM;
2128
2129 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2130 bytes + 2 * 1024 * 1024,
2131 alloc_target, 0);
2132 btrfs_end_transaction(trans, root);
2133 if (ret)
2134 return ret;
2135 goto again;
2136 }
2137 spin_unlock(&data_sinfo->lock);
2138
2139 /* commit the current transaction and try again */
2140 if (!committed) {
2141 committed = 1;
2142 trans = btrfs_join_transaction(root, 1);
2143 if (!trans)
2144 return -ENOMEM;
2145 ret = btrfs_commit_transaction(trans, root);
2146 if (ret)
2147 return ret;
2148 goto again;
2149 }
2150
2151 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
2152 ", %llu bytes_used, %llu bytes_reserved, "
2153 "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
2154 "%llu total\n", bytes, data_sinfo->bytes_delalloc,
2155 data_sinfo->bytes_used, data_sinfo->bytes_reserved,
2156 data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
2157 data_sinfo->bytes_may_use, data_sinfo->total_bytes);
2158 return -ENOSPC;
2159 }
2160 data_sinfo->bytes_may_use += bytes;
2161 BTRFS_I(inode)->reserved_bytes += bytes;
2162 spin_unlock(&data_sinfo->lock);
2163
2164 return btrfs_check_metadata_free_space(root);
2165}
2166
2167/*
2168 * if there was an error for whatever reason after calling
2169 * btrfs_check_data_free_space, call this so we can cleanup the counters.
2170 */
2171void btrfs_free_reserved_data_space(struct btrfs_root *root,
2172 struct inode *inode, u64 bytes)
2173{
2174 struct btrfs_space_info *data_sinfo;
2175
2176 /* make sure bytes are sectorsize aligned */
2177 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2178
2179 data_sinfo = BTRFS_I(inode)->space_info;
2180 spin_lock(&data_sinfo->lock);
2181 data_sinfo->bytes_may_use -= bytes;
2182 BTRFS_I(inode)->reserved_bytes -= bytes;
2183 spin_unlock(&data_sinfo->lock);
2184}
2185
2186/* called when we are adding a delalloc extent to the inode's io_tree */
2187void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
2188 u64 bytes)
2189{
2190 struct btrfs_space_info *data_sinfo;
2191
2192 /* get the space info for where this inode will be storing its data */
2193 data_sinfo = BTRFS_I(inode)->space_info;
2194
2195 /* make sure we have enough space to handle the data first */
2196 spin_lock(&data_sinfo->lock);
2197 data_sinfo->bytes_delalloc += bytes;
2198
2199 /*
2200 * we are adding a delalloc extent without calling
2201 * btrfs_check_data_free_space first. This happens on a weird
2202 * writepage condition, but shouldn't hurt our accounting
2203 */
2204 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
2205 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
2206 BTRFS_I(inode)->reserved_bytes = 0;
2207 } else {
2208 data_sinfo->bytes_may_use -= bytes;
2209 BTRFS_I(inode)->reserved_bytes -= bytes;
2210 }
2211
2212 spin_unlock(&data_sinfo->lock);
2213}
2214
2215/* called when we are clearing an delalloc extent from the inode's io_tree */
2216void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
2217 u64 bytes)
2218{
2219 struct btrfs_space_info *info;
2220
2221 info = BTRFS_I(inode)->space_info;
2222
2223 spin_lock(&info->lock);
2224 info->bytes_delalloc -= bytes;
2225 spin_unlock(&info->lock);
2226}
2227
1884static int do_chunk_alloc(struct btrfs_trans_handle *trans, 2228static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1885 struct btrfs_root *extent_root, u64 alloc_bytes, 2229 struct btrfs_root *extent_root, u64 alloc_bytes,
1886 u64 flags, int force) 2230 u64 flags, int force)
@@ -2137,13 +2481,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
2137 u64 end; 2481 u64 end;
2138 u64 priv; 2482 u64 priv;
2139 u64 search = 0; 2483 u64 search = 0;
2140 u64 skipped = 0;
2141 struct btrfs_fs_info *info = extent_root->fs_info; 2484 struct btrfs_fs_info *info = extent_root->fs_info;
2142 struct btrfs_path *path; 2485 struct btrfs_path *path;
2143 struct pending_extent_op *extent_op, *tmp; 2486 struct pending_extent_op *extent_op, *tmp;
2144 struct list_head insert_list, update_list; 2487 struct list_head insert_list, update_list;
2145 int ret; 2488 int ret;
2146 int num_inserts = 0, max_inserts; 2489 int num_inserts = 0, max_inserts, restart = 0;
2147 2490
2148 path = btrfs_alloc_path(); 2491 path = btrfs_alloc_path();
2149 INIT_LIST_HEAD(&insert_list); 2492 INIT_LIST_HEAD(&insert_list);
@@ -2159,18 +2502,19 @@ again:
2159 ret = find_first_extent_bit(&info->extent_ins, search, &start, 2502 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2160 &end, EXTENT_WRITEBACK); 2503 &end, EXTENT_WRITEBACK);
2161 if (ret) { 2504 if (ret) {
2162 if (skipped && all && !num_inserts) { 2505 if (restart && !num_inserts &&
2163 skipped = 0; 2506 list_empty(&update_list)) {
2507 restart = 0;
2164 search = 0; 2508 search = 0;
2165 continue; 2509 continue;
2166 } 2510 }
2167 mutex_unlock(&info->extent_ins_mutex);
2168 break; 2511 break;
2169 } 2512 }
2170 2513
2171 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); 2514 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2172 if (!ret) { 2515 if (!ret) {
2173 skipped = 1; 2516 if (all)
2517 restart = 1;
2174 search = end + 1; 2518 search = end + 1;
2175 if (need_resched()) { 2519 if (need_resched()) {
2176 mutex_unlock(&info->extent_ins_mutex); 2520 mutex_unlock(&info->extent_ins_mutex);
@@ -2189,7 +2533,7 @@ again:
2189 list_add_tail(&extent_op->list, &insert_list); 2533 list_add_tail(&extent_op->list, &insert_list);
2190 search = end + 1; 2534 search = end + 1;
2191 if (num_inserts == max_inserts) { 2535 if (num_inserts == max_inserts) {
2192 mutex_unlock(&info->extent_ins_mutex); 2536 restart = 1;
2193 break; 2537 break;
2194 } 2538 }
2195 } else if (extent_op->type == PENDING_BACKREF_UPDATE) { 2539 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
@@ -2205,7 +2549,6 @@ again:
2205 * somebody marked this thing for deletion then just unlock it and be 2549 * somebody marked this thing for deletion then just unlock it and be
2206 * done, the free_extents will handle it 2550 * done, the free_extents will handle it
2207 */ 2551 */
2208 mutex_lock(&info->extent_ins_mutex);
2209 list_for_each_entry_safe(extent_op, tmp, &update_list, list) { 2552 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2210 clear_extent_bits(&info->extent_ins, extent_op->bytenr, 2553 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2211 extent_op->bytenr + extent_op->num_bytes - 1, 2554 extent_op->bytenr + extent_op->num_bytes - 1,
@@ -2227,6 +2570,10 @@ again:
2227 if (!list_empty(&update_list)) { 2570 if (!list_empty(&update_list)) {
2228 ret = update_backrefs(trans, extent_root, path, &update_list); 2571 ret = update_backrefs(trans, extent_root, path, &update_list);
2229 BUG_ON(ret); 2572 BUG_ON(ret);
2573
2574 /* we may have COW'ed new blocks, so lets start over */
2575 if (all)
2576 restart = 1;
2230 } 2577 }
2231 2578
2232 /* 2579 /*
@@ -2234,9 +2581,9 @@ again:
2234 * need to make sure everything is cleaned then reset everything and 2581 * need to make sure everything is cleaned then reset everything and
2235 * go back to the beginning 2582 * go back to the beginning
2236 */ 2583 */
2237 if (!num_inserts && all && skipped) { 2584 if (!num_inserts && restart) {
2238 search = 0; 2585 search = 0;
2239 skipped = 0; 2586 restart = 0;
2240 INIT_LIST_HEAD(&update_list); 2587 INIT_LIST_HEAD(&update_list);
2241 INIT_LIST_HEAD(&insert_list); 2588 INIT_LIST_HEAD(&insert_list);
2242 goto again; 2589 goto again;
@@ -2293,27 +2640,19 @@ again:
2293 BUG_ON(ret); 2640 BUG_ON(ret);
2294 2641
2295 /* 2642 /*
2296 * if we broke out of the loop in order to insert stuff because we hit 2643 * if restart is set for whatever reason we need to go back and start
2297 * the maximum number of inserts at a time we can handle, then loop 2644 * searching through the pending list again.
2298 * back and pick up where we left off 2645 *
2299 */ 2646 * We just inserted some extents, which could have resulted in new
2300 if (num_inserts == max_inserts) { 2647 * blocks being allocated, which would result in new blocks needing
2301 INIT_LIST_HEAD(&insert_list); 2648 * updates, so if all is set we _must_ restart to get the updated
2302 INIT_LIST_HEAD(&update_list); 2649 * blocks.
2303 num_inserts = 0;
2304 goto again;
2305 }
2306
2307 /*
2308 * again, if we need to make absolutely sure there are no more pending
2309 * extent operations left and we know that we skipped some, go back to
2310 * the beginning and do it all again
2311 */ 2650 */
2312 if (all && skipped) { 2651 if (restart || all) {
2313 INIT_LIST_HEAD(&insert_list); 2652 INIT_LIST_HEAD(&insert_list);
2314 INIT_LIST_HEAD(&update_list); 2653 INIT_LIST_HEAD(&update_list);
2315 search = 0; 2654 search = 0;
2316 skipped = 0; 2655 restart = 0;
2317 num_inserts = 0; 2656 num_inserts = 0;
2318 goto again; 2657 goto again;
2319 } 2658 }
@@ -2547,6 +2886,7 @@ again:
2547 if (ret) { 2886 if (ret) {
2548 if (all && skipped && !nr) { 2887 if (all && skipped && !nr) {
2549 search = 0; 2888 search = 0;
2889 skipped = 0;
2550 continue; 2890 continue;
2551 } 2891 }
2552 mutex_unlock(&info->extent_ins_mutex); 2892 mutex_unlock(&info->extent_ins_mutex);
@@ -2633,6 +2973,8 @@ again:
2633 goto again; 2973 goto again;
2634 } 2974 }
2635 2975
2976 if (!err)
2977 finish_current_insert(trans, extent_root, 0);
2636 return err; 2978 return err;
2637} 2979}
2638 2980
@@ -2700,13 +3042,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2700 /* if metadata always pin */ 3042 /* if metadata always pin */
2701 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { 3043 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2702 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3044 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2703 struct btrfs_block_group_cache *cache; 3045 mutex_lock(&root->fs_info->pinned_mutex);
2704 3046 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2705 /* btrfs_free_reserved_extent */ 3047 mutex_unlock(&root->fs_info->pinned_mutex);
2706 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2707 BUG_ON(!cache);
2708 btrfs_add_free_space(cache, bytenr, num_bytes);
2709 put_block_group(cache);
2710 update_reserved_extents(root, bytenr, num_bytes, 0); 3048 update_reserved_extents(root, bytenr, num_bytes, 0);
2711 return 0; 3049 return 0;
2712 } 3050 }
@@ -2787,7 +3125,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2787 3125
2788 if (data & BTRFS_BLOCK_GROUP_METADATA) { 3126 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2789 last_ptr = &root->fs_info->last_alloc; 3127 last_ptr = &root->fs_info->last_alloc;
2790 empty_cluster = 64 * 1024; 3128 if (!btrfs_test_opt(root, SSD))
3129 empty_cluster = 64 * 1024;
2791 } 3130 }
2792 3131
2793 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) 3132 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
@@ -3014,16 +3353,18 @@ loop_check:
3014static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 3353static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3015{ 3354{
3016 struct btrfs_block_group_cache *cache; 3355 struct btrfs_block_group_cache *cache;
3017 struct list_head *l;
3018 3356
3019 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 3357 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3020 (unsigned long long)(info->total_bytes - info->bytes_used - 3358 (unsigned long long)(info->total_bytes - info->bytes_used -
3021 info->bytes_pinned - info->bytes_reserved), 3359 info->bytes_pinned - info->bytes_reserved),
3022 (info->full) ? "" : "not "); 3360 (info->full) ? "" : "not ");
3361 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
3362 " may_use=%llu, used=%llu\n", info->total_bytes,
3363 info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
3364 info->bytes_used);
3023 3365
3024 down_read(&info->groups_sem); 3366 down_read(&info->groups_sem);
3025 list_for_each(l, &info->block_groups) { 3367 list_for_each_entry(cache, &info->block_groups, list) {
3026 cache = list_entry(l, struct btrfs_block_group_cache, list);
3027 spin_lock(&cache->lock); 3368 spin_lock(&cache->lock);
3028 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 3369 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3029 "%llu pinned %llu reserved\n", 3370 "%llu pinned %llu reserved\n",
@@ -3047,24 +3388,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3047{ 3388{
3048 int ret; 3389 int ret;
3049 u64 search_start = 0; 3390 u64 search_start = 0;
3050 u64 alloc_profile;
3051 struct btrfs_fs_info *info = root->fs_info; 3391 struct btrfs_fs_info *info = root->fs_info;
3052 3392
3053 if (data) { 3393 data = btrfs_get_alloc_profile(root, data);
3054 alloc_profile = info->avail_data_alloc_bits &
3055 info->data_alloc_profile;
3056 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
3057 } else if (root == root->fs_info->chunk_root) {
3058 alloc_profile = info->avail_system_alloc_bits &
3059 info->system_alloc_profile;
3060 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
3061 } else {
3062 alloc_profile = info->avail_metadata_alloc_bits &
3063 info->metadata_alloc_profile;
3064 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
3065 }
3066again: 3394again:
3067 data = btrfs_reduce_alloc_profile(root, data);
3068 /* 3395 /*
3069 * the only place that sets empty_size is btrfs_realloc_node, which 3396 * the only place that sets empty_size is btrfs_realloc_node, which
3070 * is not called recursively on allocations 3397 * is not called recursively on allocations
@@ -3332,7 +3659,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3332 3659
3333struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 3660struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3334 struct btrfs_root *root, 3661 struct btrfs_root *root,
3335 u64 bytenr, u32 blocksize) 3662 u64 bytenr, u32 blocksize,
3663 int level)
3336{ 3664{
3337 struct extent_buffer *buf; 3665 struct extent_buffer *buf;
3338 3666
@@ -3340,9 +3668,13 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3340 if (!buf) 3668 if (!buf)
3341 return ERR_PTR(-ENOMEM); 3669 return ERR_PTR(-ENOMEM);
3342 btrfs_set_header_generation(buf, trans->transid); 3670 btrfs_set_header_generation(buf, trans->transid);
3671 btrfs_set_buffer_lockdep_class(buf, level);
3343 btrfs_tree_lock(buf); 3672 btrfs_tree_lock(buf);
3344 clean_tree_block(trans, root, buf); 3673 clean_tree_block(trans, root, buf);
3674
3675 btrfs_set_lock_blocking(buf);
3345 btrfs_set_buffer_uptodate(buf); 3676 btrfs_set_buffer_uptodate(buf);
3677
3346 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 3678 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3347 set_extent_dirty(&root->dirty_log_pages, buf->start, 3679 set_extent_dirty(&root->dirty_log_pages, buf->start,
3348 buf->start + buf->len - 1, GFP_NOFS); 3680 buf->start + buf->len - 1, GFP_NOFS);
@@ -3351,6 +3683,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3351 buf->start + buf->len - 1, GFP_NOFS); 3683 buf->start + buf->len - 1, GFP_NOFS);
3352 } 3684 }
3353 trans->blocks_used++; 3685 trans->blocks_used++;
3686 /* this returns a buffer locked for blocking */
3354 return buf; 3687 return buf;
3355} 3688}
3356 3689
@@ -3379,7 +3712,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3379 return ERR_PTR(ret); 3712 return ERR_PTR(ret);
3380 } 3713 }
3381 3714
3382 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); 3715 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
3716 blocksize, level);
3383 return buf; 3717 return buf;
3384} 3718}
3385 3719
@@ -3388,36 +3722,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3388{ 3722{
3389 u64 leaf_owner; 3723 u64 leaf_owner;
3390 u64 leaf_generation; 3724 u64 leaf_generation;
3725 struct refsort *sorted;
3391 struct btrfs_key key; 3726 struct btrfs_key key;
3392 struct btrfs_file_extent_item *fi; 3727 struct btrfs_file_extent_item *fi;
3393 int i; 3728 int i;
3394 int nritems; 3729 int nritems;
3395 int ret; 3730 int ret;
3731 int refi = 0;
3732 int slot;
3396 3733
3397 BUG_ON(!btrfs_is_leaf(leaf)); 3734 BUG_ON(!btrfs_is_leaf(leaf));
3398 nritems = btrfs_header_nritems(leaf); 3735 nritems = btrfs_header_nritems(leaf);
3399 leaf_owner = btrfs_header_owner(leaf); 3736 leaf_owner = btrfs_header_owner(leaf);
3400 leaf_generation = btrfs_header_generation(leaf); 3737 leaf_generation = btrfs_header_generation(leaf);
3401 3738
3739 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3740 /* we do this loop twice. The first time we build a list
3741 * of the extents we have a reference on, then we sort the list
3742 * by bytenr. The second time around we actually do the
3743 * extent freeing.
3744 */
3402 for (i = 0; i < nritems; i++) { 3745 for (i = 0; i < nritems; i++) {
3403 u64 disk_bytenr; 3746 u64 disk_bytenr;
3404 cond_resched(); 3747 cond_resched();
3405 3748
3406 btrfs_item_key_to_cpu(leaf, &key, i); 3749 btrfs_item_key_to_cpu(leaf, &key, i);
3750
3751 /* only extents have references, skip everything else */
3407 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3752 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3408 continue; 3753 continue;
3754
3409 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3755 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3756
3757 /* inline extents live in the btree, they don't have refs */
3410 if (btrfs_file_extent_type(leaf, fi) == 3758 if (btrfs_file_extent_type(leaf, fi) ==
3411 BTRFS_FILE_EXTENT_INLINE) 3759 BTRFS_FILE_EXTENT_INLINE)
3412 continue; 3760 continue;
3413 /* 3761
3414 * FIXME make sure to insert a trans record that
3415 * repeats the snapshot del on crash
3416 */
3417 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 3762 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3763
3764 /* holes don't have refs */
3418 if (disk_bytenr == 0) 3765 if (disk_bytenr == 0)
3419 continue; 3766 continue;
3420 3767
3768 sorted[refi].bytenr = disk_bytenr;
3769 sorted[refi].slot = i;
3770 refi++;
3771 }
3772
3773 if (refi == 0)
3774 goto out;
3775
3776 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3777
3778 for (i = 0; i < refi; i++) {
3779 u64 disk_bytenr;
3780
3781 disk_bytenr = sorted[i].bytenr;
3782 slot = sorted[i].slot;
3783
3784 cond_resched();
3785
3786 btrfs_item_key_to_cpu(leaf, &key, slot);
3787 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3788 continue;
3789
3790 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3791
3421 ret = __btrfs_free_extent(trans, root, disk_bytenr, 3792 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3422 btrfs_file_extent_disk_num_bytes(leaf, fi), 3793 btrfs_file_extent_disk_num_bytes(leaf, fi),
3423 leaf->start, leaf_owner, leaf_generation, 3794 leaf->start, leaf_owner, leaf_generation,
@@ -3428,6 +3799,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3428 wake_up(&root->fs_info->transaction_throttle); 3799 wake_up(&root->fs_info->transaction_throttle);
3429 cond_resched(); 3800 cond_resched();
3430 } 3801 }
3802out:
3803 kfree(sorted);
3431 return 0; 3804 return 0;
3432} 3805}
3433 3806
@@ -3437,9 +3810,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3437{ 3810{
3438 int i; 3811 int i;
3439 int ret; 3812 int ret;
3440 struct btrfs_extent_info *info = ref->extents; 3813 struct btrfs_extent_info *info;
3814 struct refsort *sorted;
3815
3816 if (ref->nritems == 0)
3817 return 0;
3441 3818
3819 sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
3442 for (i = 0; i < ref->nritems; i++) { 3820 for (i = 0; i < ref->nritems; i++) {
3821 sorted[i].bytenr = ref->extents[i].bytenr;
3822 sorted[i].slot = i;
3823 }
3824 sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3825
3826 /*
3827 * the items in the ref were sorted when the ref was inserted
3828 * into the ref cache, so this is already in order
3829 */
3830 for (i = 0; i < ref->nritems; i++) {
3831 info = ref->extents + sorted[i].slot;
3443 ret = __btrfs_free_extent(trans, root, info->bytenr, 3832 ret = __btrfs_free_extent(trans, root, info->bytenr,
3444 info->num_bytes, ref->bytenr, 3833 info->num_bytes, ref->bytenr,
3445 ref->owner, ref->generation, 3834 ref->owner, ref->generation,
@@ -3453,6 +3842,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3453 info++; 3842 info++;
3454 } 3843 }
3455 3844
3845 kfree(sorted);
3456 return 0; 3846 return 0;
3457} 3847}
3458 3848
@@ -3497,6 +3887,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3497} 3887}
3498 3888
3499/* 3889/*
3890 * this is used while deleting old snapshots, and it drops the refs
3891 * on a whole subtree starting from a level 1 node.
3892 *
3893 * The idea is to sort all the leaf pointers, and then drop the
3894 * ref on all the leaves in order. Most of the time the leaves
3895 * will have ref cache entries, so no leaf IOs will be required to
3896 * find the extents they have references on.
3897 *
3898 * For each leaf, any references it has are also dropped in order
3899 *
3900 * This ends up dropping the references in something close to optimal
3901 * order for reading and modifying the extent allocation tree.
3902 */
3903static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3904 struct btrfs_root *root,
3905 struct btrfs_path *path)
3906{
3907 u64 bytenr;
3908 u64 root_owner;
3909 u64 root_gen;
3910 struct extent_buffer *eb = path->nodes[1];
3911 struct extent_buffer *leaf;
3912 struct btrfs_leaf_ref *ref;
3913 struct refsort *sorted = NULL;
3914 int nritems = btrfs_header_nritems(eb);
3915 int ret;
3916 int i;
3917 int refi = 0;
3918 int slot = path->slots[1];
3919 u32 blocksize = btrfs_level_size(root, 0);
3920 u32 refs;
3921
3922 if (nritems == 0)
3923 goto out;
3924
3925 root_owner = btrfs_header_owner(eb);
3926 root_gen = btrfs_header_generation(eb);
3927 sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
3928
3929 /*
3930 * step one, sort all the leaf pointers so we don't scribble
3931 * randomly into the extent allocation tree
3932 */
3933 for (i = slot; i < nritems; i++) {
3934 sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3935 sorted[refi].slot = i;
3936 refi++;
3937 }
3938
3939 /*
3940 * nritems won't be zero, but if we're picking up drop_snapshot
3941 * after a crash, slot might be > 0, so double check things
3942 * just in case.
3943 */
3944 if (refi == 0)
3945 goto out;
3946
3947 sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3948
3949 /*
3950 * the first loop frees everything the leaves point to
3951 */
3952 for (i = 0; i < refi; i++) {
3953 u64 ptr_gen;
3954
3955 bytenr = sorted[i].bytenr;
3956
3957 /*
3958 * check the reference count on this leaf. If it is > 1
3959 * we just decrement it below and don't update any
3960 * of the refs the leaf points to.
3961 */
3962 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3963 BUG_ON(ret);
3964 if (refs != 1)
3965 continue;
3966
3967 ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3968
3969 /*
3970 * the leaf only had one reference, which means the
3971 * only thing pointing to this leaf is the snapshot
3972 * we're deleting. It isn't possible for the reference
3973 * count to increase again later
3974 *
3975 * The reference cache is checked for the leaf,
3976 * and if found we'll be able to drop any refs held by
3977 * the leaf without needing to read it in.
3978 */
3979 ref = btrfs_lookup_leaf_ref(root, bytenr);
3980 if (ref && ref->generation != ptr_gen) {
3981 btrfs_free_leaf_ref(root, ref);
3982 ref = NULL;
3983 }
3984 if (ref) {
3985 ret = cache_drop_leaf_ref(trans, root, ref);
3986 BUG_ON(ret);
3987 btrfs_remove_leaf_ref(root, ref);
3988 btrfs_free_leaf_ref(root, ref);
3989 } else {
3990 /*
3991 * the leaf wasn't in the reference cache, so
3992 * we have to read it.
3993 */
3994 leaf = read_tree_block(root, bytenr, blocksize,
3995 ptr_gen);
3996 ret = btrfs_drop_leaf_ref(trans, root, leaf);
3997 BUG_ON(ret);
3998 free_extent_buffer(leaf);
3999 }
4000 atomic_inc(&root->fs_info->throttle_gen);
4001 wake_up(&root->fs_info->transaction_throttle);
4002 cond_resched();
4003 }
4004
4005 /*
4006 * run through the loop again to free the refs on the leaves.
4007 * This is faster than doing it in the loop above because
4008 * the leaves are likely to be clustered together. We end up
4009 * working in nice chunks on the extent allocation tree.
4010 */
4011 for (i = 0; i < refi; i++) {
4012 bytenr = sorted[i].bytenr;
4013 ret = __btrfs_free_extent(trans, root, bytenr,
4014 blocksize, eb->start,
4015 root_owner, root_gen, 0, 1);
4016 BUG_ON(ret);
4017
4018 atomic_inc(&root->fs_info->throttle_gen);
4019 wake_up(&root->fs_info->transaction_throttle);
4020 cond_resched();
4021 }
4022out:
4023 kfree(sorted);
4024
4025 /*
4026 * update the path to show we've processed the entire level 1
4027 * node. This will get saved into the root's drop_snapshot_progress
4028 * field so these drops are not repeated again if this transaction
4029 * commits.
4030 */
4031 path->slots[1] = nritems;
4032 return 0;
4033}
4034
4035/*
3500 * helper function for drop_snapshot, this walks down the tree dropping ref 4036 * helper function for drop_snapshot, this walks down the tree dropping ref
3501 * counts as it goes. 4037 * counts as it goes.
3502 */ 4038 */
@@ -3511,7 +4047,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3511 struct extent_buffer *next; 4047 struct extent_buffer *next;
3512 struct extent_buffer *cur; 4048 struct extent_buffer *cur;
3513 struct extent_buffer *parent; 4049 struct extent_buffer *parent;
3514 struct btrfs_leaf_ref *ref;
3515 u32 blocksize; 4050 u32 blocksize;
3516 int ret; 4051 int ret;
3517 u32 refs; 4052 u32 refs;
@@ -3538,17 +4073,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3538 if (path->slots[*level] >= 4073 if (path->slots[*level] >=
3539 btrfs_header_nritems(cur)) 4074 btrfs_header_nritems(cur))
3540 break; 4075 break;
4076
4077 /* the new code goes down to level 1 and does all the
4078 * leaves pointed to that node in bulk. So, this check
4079 * for level 0 will always be false.
4080 *
4081 * But, the disk format allows the drop_snapshot_progress
4082 * field in the root to leave things in a state where
4083 * a leaf will need cleaning up here. If someone crashes
4084 * with the old code and then boots with the new code,
4085 * we might find a leaf here.
4086 */
3541 if (*level == 0) { 4087 if (*level == 0) {
3542 ret = btrfs_drop_leaf_ref(trans, root, cur); 4088 ret = btrfs_drop_leaf_ref(trans, root, cur);
3543 BUG_ON(ret); 4089 BUG_ON(ret);
3544 break; 4090 break;
3545 } 4091 }
4092
4093 /*
4094 * once we get to level one, process the whole node
4095 * at once, including everything below it.
4096 */
4097 if (*level == 1) {
4098 ret = drop_level_one_refs(trans, root, path);
4099 BUG_ON(ret);
4100 break;
4101 }
4102
3546 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 4103 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3547 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 4104 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3548 blocksize = btrfs_level_size(root, *level - 1); 4105 blocksize = btrfs_level_size(root, *level - 1);
3549 4106
3550 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); 4107 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3551 BUG_ON(ret); 4108 BUG_ON(ret);
4109
4110 /*
4111 * if there is more than one reference, we don't need
4112 * to read that node to drop any references it has. We
4113 * just drop the ref we hold on that node and move on to the
4114 * next slot in this level.
4115 */
3552 if (refs != 1) { 4116 if (refs != 1) {
3553 parent = path->nodes[*level]; 4117 parent = path->nodes[*level];
3554 root_owner = btrfs_header_owner(parent); 4118 root_owner = btrfs_header_owner(parent);
@@ -3567,46 +4131,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3567 4131
3568 continue; 4132 continue;
3569 } 4133 }
4134
3570 /* 4135 /*
3571 * at this point, we have a single ref, and since the 4136 * we need to keep freeing things in the next level down.
3572 * only place referencing this extent is a dead root 4137 * read the block and loop around to process it
3573 * the reference count should never go higher.
3574 * So, we don't need to check it again
3575 */ 4138 */
3576 if (*level == 1) { 4139 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3577 ref = btrfs_lookup_leaf_ref(root, bytenr);
3578 if (ref && ref->generation != ptr_gen) {
3579 btrfs_free_leaf_ref(root, ref);
3580 ref = NULL;
3581 }
3582 if (ref) {
3583 ret = cache_drop_leaf_ref(trans, root, ref);
3584 BUG_ON(ret);
3585 btrfs_remove_leaf_ref(root, ref);
3586 btrfs_free_leaf_ref(root, ref);
3587 *level = 0;
3588 break;
3589 }
3590 }
3591 next = btrfs_find_tree_block(root, bytenr, blocksize);
3592 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3593 free_extent_buffer(next);
3594
3595 next = read_tree_block(root, bytenr, blocksize,
3596 ptr_gen);
3597 cond_resched();
3598#if 0
3599 /*
3600 * this is a debugging check and can go away
3601 * the ref should never go all the way down to 1
3602 * at this point
3603 */
3604 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3605 &refs);
3606 BUG_ON(ret);
3607 WARN_ON(refs != 1);
3608#endif
3609 }
3610 WARN_ON(*level <= 0); 4140 WARN_ON(*level <= 0);
3611 if (path->nodes[*level-1]) 4141 if (path->nodes[*level-1])
3612 free_extent_buffer(path->nodes[*level-1]); 4142 free_extent_buffer(path->nodes[*level-1]);
@@ -3631,11 +4161,16 @@ out:
3631 root_owner = btrfs_header_owner(parent); 4161 root_owner = btrfs_header_owner(parent);
3632 root_gen = btrfs_header_generation(parent); 4162 root_gen = btrfs_header_generation(parent);
3633 4163
4164 /*
4165 * cleanup and free the reference on the last node
4166 * we processed
4167 */
3634 ret = __btrfs_free_extent(trans, root, bytenr, blocksize, 4168 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3635 parent->start, root_owner, root_gen, 4169 parent->start, root_owner, root_gen,
3636 *level, 1); 4170 *level, 1);
3637 free_extent_buffer(path->nodes[*level]); 4171 free_extent_buffer(path->nodes[*level]);
3638 path->nodes[*level] = NULL; 4172 path->nodes[*level] = NULL;
4173
3639 *level += 1; 4174 *level += 1;
3640 BUG_ON(ret); 4175 BUG_ON(ret);
3641 4176
@@ -3687,6 +4222,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3687 4222
3688 next = read_tree_block(root, bytenr, blocksize, ptr_gen); 4223 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3689 btrfs_tree_lock(next); 4224 btrfs_tree_lock(next);
4225 btrfs_set_lock_blocking(next);
3690 4226
3691 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, 4227 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3692 &refs); 4228 &refs);
@@ -3754,6 +4290,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3754 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 4290 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3755 struct extent_buffer *node; 4291 struct extent_buffer *node;
3756 struct btrfs_disk_key disk_key; 4292 struct btrfs_disk_key disk_key;
4293
4294 /*
4295 * there is more work to do in this level.
4296 * Update the drop_progress marker to reflect
4297 * the work we've done so far, and then bump
4298 * the slot number
4299 */
3757 node = path->nodes[i]; 4300 node = path->nodes[i];
3758 path->slots[i]++; 4301 path->slots[i]++;
3759 *level = i; 4302 *level = i;
@@ -3765,6 +4308,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3765 return 0; 4308 return 0;
3766 } else { 4309 } else {
3767 struct extent_buffer *parent; 4310 struct extent_buffer *parent;
4311
4312 /*
4313 * this whole node is done, free our reference
4314 * on it and go up one level
4315 */
3768 if (path->nodes[*level] == root->node) 4316 if (path->nodes[*level] == root->node)
3769 parent = path->nodes[*level]; 4317 parent = path->nodes[*level];
3770 else 4318 else
@@ -3891,13 +4439,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3891 path = btrfs_alloc_path(); 4439 path = btrfs_alloc_path();
3892 BUG_ON(!path); 4440 BUG_ON(!path);
3893 4441
3894 BUG_ON(!btrfs_tree_locked(parent)); 4442 btrfs_assert_tree_locked(parent);
3895 parent_level = btrfs_header_level(parent); 4443 parent_level = btrfs_header_level(parent);
3896 extent_buffer_get(parent); 4444 extent_buffer_get(parent);
3897 path->nodes[parent_level] = parent; 4445 path->nodes[parent_level] = parent;
3898 path->slots[parent_level] = btrfs_header_nritems(parent); 4446 path->slots[parent_level] = btrfs_header_nritems(parent);
3899 4447
3900 BUG_ON(!btrfs_tree_locked(node)); 4448 btrfs_assert_tree_locked(node);
3901 level = btrfs_header_level(node); 4449 level = btrfs_header_level(node);
3902 extent_buffer_get(node); 4450 extent_buffer_get(node);
3903 path->nodes[level] = node; 4451 path->nodes[level] = node;
@@ -4444,7 +4992,7 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4444 u64 lock_end = 0; 4992 u64 lock_end = 0;
4445 u64 num_bytes; 4993 u64 num_bytes;
4446 u64 ext_offset; 4994 u64 ext_offset;
4447 u64 first_pos; 4995 u64 search_end = (u64)-1;
4448 u32 nritems; 4996 u32 nritems;
4449 int nr_scaned = 0; 4997 int nr_scaned = 0;
4450 int extent_locked = 0; 4998 int extent_locked = 0;
@@ -4452,7 +5000,6 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4452 int ret; 5000 int ret;
4453 5001
4454 memcpy(&key, leaf_key, sizeof(key)); 5002 memcpy(&key, leaf_key, sizeof(key));
4455 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4456 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { 5003 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4457 if (key.objectid < ref_path->owner_objectid || 5004 if (key.objectid < ref_path->owner_objectid ||
4458 (key.objectid == ref_path->owner_objectid && 5005 (key.objectid == ref_path->owner_objectid &&
@@ -4501,7 +5048,7 @@ next:
4501 if ((key.objectid > ref_path->owner_objectid) || 5048 if ((key.objectid > ref_path->owner_objectid) ||
4502 (key.objectid == ref_path->owner_objectid && 5049 (key.objectid == ref_path->owner_objectid &&
4503 key.type > BTRFS_EXTENT_DATA_KEY) || 5050 key.type > BTRFS_EXTENT_DATA_KEY) ||
4504 (key.offset >= first_pos + extent_key->offset)) 5051 key.offset >= search_end)
4505 break; 5052 break;
4506 } 5053 }
4507 5054
@@ -4534,8 +5081,10 @@ next:
4534 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 5081 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4535 ext_offset = btrfs_file_extent_offset(leaf, fi); 5082 ext_offset = btrfs_file_extent_offset(leaf, fi);
4536 5083
4537 if (first_pos > key.offset - ext_offset) 5084 if (search_end == (u64)-1) {
4538 first_pos = key.offset - ext_offset; 5085 search_end = key.offset - ext_offset +
5086 btrfs_file_extent_ram_bytes(leaf, fi);
5087 }
4539 5088
4540 if (!extent_locked) { 5089 if (!extent_locked) {
4541 lock_start = key.offset; 5090 lock_start = key.offset;
@@ -4724,7 +5273,7 @@ next:
4724 } 5273 }
4725skip: 5274skip:
4726 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && 5275 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4727 key.offset >= first_pos + extent_key->offset) 5276 key.offset >= search_end)
4728 break; 5277 break;
4729 5278
4730 cond_resched(); 5279 cond_resched();
@@ -4778,6 +5327,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4778 ref->bytenr = buf->start; 5327 ref->bytenr = buf->start;
4779 ref->owner = btrfs_header_owner(buf); 5328 ref->owner = btrfs_header_owner(buf);
4780 ref->generation = btrfs_header_generation(buf); 5329 ref->generation = btrfs_header_generation(buf);
5330
4781 ret = btrfs_add_leaf_ref(root, ref, 0); 5331 ret = btrfs_add_leaf_ref(root, ref, 0);
4782 WARN_ON(ret); 5332 WARN_ON(ret);
4783 btrfs_free_leaf_ref(root, ref); 5333 btrfs_free_leaf_ref(root, ref);
@@ -5351,7 +5901,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5351 prev_block = block_start; 5901 prev_block = block_start;
5352 } 5902 }
5353 5903
5904 mutex_lock(&extent_root->fs_info->trans_mutex);
5354 btrfs_record_root_in_trans(found_root); 5905 btrfs_record_root_in_trans(found_root);
5906 mutex_unlock(&extent_root->fs_info->trans_mutex);
5355 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { 5907 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5356 /* 5908 /*
5357 * try to update data extent references while 5909 * try to update data extent references while
@@ -5789,6 +6341,7 @@ out:
5789int btrfs_free_block_groups(struct btrfs_fs_info *info) 6341int btrfs_free_block_groups(struct btrfs_fs_info *info)
5790{ 6342{
5791 struct btrfs_block_group_cache *block_group; 6343 struct btrfs_block_group_cache *block_group;
6344 struct btrfs_space_info *space_info;
5792 struct rb_node *n; 6345 struct rb_node *n;
5793 6346
5794 spin_lock(&info->block_group_cache_lock); 6347 spin_lock(&info->block_group_cache_lock);
@@ -5810,6 +6363,23 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
5810 spin_lock(&info->block_group_cache_lock); 6363 spin_lock(&info->block_group_cache_lock);
5811 } 6364 }
5812 spin_unlock(&info->block_group_cache_lock); 6365 spin_unlock(&info->block_group_cache_lock);
6366
6367 /* now that all the block groups are freed, go through and
6368 * free all the space_info structs. This is only called during
6369 * the final stages of unmount, and so we know nobody is
6370 * using them. We call synchronize_rcu() once before we start,
6371 * just to be on the safe side.
6372 */
6373 synchronize_rcu();
6374
6375 while(!list_empty(&info->space_info)) {
6376 space_info = list_entry(info->space_info.next,
6377 struct btrfs_space_info,
6378 list);
6379
6380 list_del(&space_info->list);
6381 kfree(space_info);
6382 }
5813 return 0; 6383 return 0;
5814} 6384}
5815 6385
@@ -5957,9 +6527,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5957 path = btrfs_alloc_path(); 6527 path = btrfs_alloc_path();
5958 BUG_ON(!path); 6528 BUG_ON(!path);
5959 6529
5960 btrfs_remove_free_space_cache(block_group); 6530 spin_lock(&root->fs_info->block_group_cache_lock);
5961 rb_erase(&block_group->cache_node, 6531 rb_erase(&block_group->cache_node,
5962 &root->fs_info->block_group_cache_tree); 6532 &root->fs_info->block_group_cache_tree);
6533 spin_unlock(&root->fs_info->block_group_cache_lock);
6534 btrfs_remove_free_space_cache(block_group);
5963 down_write(&block_group->space_info->groups_sem); 6535 down_write(&block_group->space_info->groups_sem);
5964 list_del(&block_group->list); 6536 list_del(&block_group->list);
5965 up_write(&block_group->space_info->groups_sem); 6537 up_write(&block_group->space_info->groups_sem);