aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorJosef Bacik <josef@redhat.com>2009-07-13 21:29:25 -0400
committerChris Mason <chris.mason@oracle.com>2009-07-24 09:23:30 -0400
commit963030817060e4f109be1993b9ae8f81dbf5e11a (patch)
tree7d81121b7e68d3d5b3317afba53d36bc1bf8221a /fs
parent83121942b28daffc9526b14b7843d8cdbd3db641 (diff)
Btrfs: use hybrid extents+bitmap rb tree for free space
Currently btrfs has a problem where it can use a ridiculous amount of RAM simply tracking free space. As free space gets fragmented, we end up with thousands of entries on an rb-tree per block group, which usually spans 1 gig of area. Since we currently don't ever flush free space cache back to disk this gets to be a bit unweildly on large fs's with lots of fragmentation. This patch solves this problem by using PAGE_SIZE bitmaps for parts of the free space cache. Initially we calculate a threshold of extent entries we can handle, which is however many extent entries we can cram into 16k of ram. The maximum amount of RAM that should ever be used to track 1 gigabyte of diskspace will be 32k of RAM, which scales much better than we did before. Once we pass the extent threshold, we start adding bitmaps and using those instead for tracking the free space. This patch also makes it so that any free space thats less than 4 * sectorsize we go ahead and put into a bitmap. This is nice since we try and allocate out of the front of a block group, so if the front of a block group is heavily fragmented and then has a huge chunk of free space at the end, we go ahead and add the fragmented areas to bitmaps and use a normal extent entry to track the big chunk at the back of the block group. I've also taken the opportunity to revamp how we search for free space. Previously we indexed free space via an offset indexed rb tree and a bytes indexed rb tree. I've dropped the bytes indexed rb tree and use only the offset indexed rb tree. This cuts the number of tree operations we were doing previously down by half, and gives us a little bit of a better allocation pattern since we will always start from a specific offset and search forward from there, instead of searching for the size we need and try and get it as close as possible to the offset we want. I've given this a healthy amount of testing pre-new format stuff, as well as post-new format stuff. I've booted up my fedora box which is installed on btrfs with this patch and ran with it for a few days without issues. I've not seen any performance regressions in any of my tests. Since the last patch Yan Zheng fixed a problem where we could have overlapping entries, so updating their offset inline would cause problems. Thanks, Signed-off-by: Josef Bacik <jbacik@redhat.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/ctree.h8
-rw-r--r--fs/btrfs/extent-tree.c25
-rw-r--r--fs/btrfs/free-space-cache.c1001
-rw-r--r--fs/btrfs/free-space-cache.h8
4 files changed, 826 insertions, 216 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index da0763135bf0..0cbf3491bb7c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -709,6 +709,9 @@ struct btrfs_free_cluster {
709 /* first extent starting offset */ 709 /* first extent starting offset */
710 u64 window_start; 710 u64 window_start;
711 711
712 /* if this cluster simply points at a bitmap in the block group */
713 bool points_to_bitmap;
714
712 struct btrfs_block_group_cache *block_group; 715 struct btrfs_block_group_cache *block_group;
713 /* 716 /*
714 * when a cluster is allocated from a block group, we put the 717 * when a cluster is allocated from a block group, we put the
@@ -726,6 +729,10 @@ struct btrfs_block_group_cache {
726 u64 pinned; 729 u64 pinned;
727 u64 reserved; 730 u64 reserved;
728 u64 flags; 731 u64 flags;
732 u64 sectorsize;
733 int extents_thresh;
734 int free_extents;
735 int total_bitmaps;
729 int cached; 736 int cached;
730 int ro; 737 int ro;
731 int dirty; 738 int dirty;
@@ -734,7 +741,6 @@ struct btrfs_block_group_cache {
734 741
735 /* free space cache stuff */ 742 /* free space cache stuff */
736 spinlock_t tree_lock; 743 spinlock_t tree_lock;
737 struct rb_root free_space_bytes;
738 struct rb_root free_space_offset; 744 struct rb_root free_space_offset;
739 745
740 /* block group cache stuff */ 746 /* block group cache stuff */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 62a332d34fdb..98697be6bdde 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3649,7 +3649,6 @@ refill_cluster:
3649 goto loop; 3649 goto loop;
3650checks: 3650checks:
3651 search_start = stripe_align(root, offset); 3651 search_start = stripe_align(root, offset);
3652
3653 /* move on to the next group */ 3652 /* move on to the next group */
3654 if (search_start + num_bytes >= search_end) { 3653 if (search_start + num_bytes >= search_end) {
3655 btrfs_add_free_space(block_group, offset, num_bytes); 3654 btrfs_add_free_space(block_group, offset, num_bytes);
@@ -7040,6 +7039,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7040 mutex_init(&cache->cache_mutex); 7039 mutex_init(&cache->cache_mutex);
7041 INIT_LIST_HEAD(&cache->list); 7040 INIT_LIST_HEAD(&cache->list);
7042 INIT_LIST_HEAD(&cache->cluster_list); 7041 INIT_LIST_HEAD(&cache->cluster_list);
7042 cache->sectorsize = root->sectorsize;
7043
7044 /*
7045 * we only want to have 32k of ram per block group for keeping
7046 * track of free space, and if we pass 1/2 of that we want to
7047 * start converting things over to using bitmaps
7048 */
7049 cache->extents_thresh = ((1024 * 32) / 2) /
7050 sizeof(struct btrfs_free_space);
7051
7043 read_extent_buffer(leaf, &cache->item, 7052 read_extent_buffer(leaf, &cache->item,
7044 btrfs_item_ptr_offset(leaf, path->slots[0]), 7053 btrfs_item_ptr_offset(leaf, path->slots[0]),
7045 sizeof(cache->item)); 7054 sizeof(cache->item));
@@ -7091,6 +7100,15 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7091 cache->key.objectid = chunk_offset; 7100 cache->key.objectid = chunk_offset;
7092 cache->key.offset = size; 7101 cache->key.offset = size;
7093 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7102 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7103 cache->sectorsize = root->sectorsize;
7104
7105 /*
7106 * we only want to have 32k of ram per block group for keeping track
7107 * of free space, and if we pass 1/2 of that we want to start
7108 * converting things over to using bitmaps
7109 */
7110 cache->extents_thresh = ((1024 * 32) / 2) /
7111 sizeof(struct btrfs_free_space);
7094 atomic_set(&cache->count, 1); 7112 atomic_set(&cache->count, 1);
7095 spin_lock_init(&cache->lock); 7113 spin_lock_init(&cache->lock);
7096 spin_lock_init(&cache->tree_lock); 7114 spin_lock_init(&cache->tree_lock);
@@ -7103,6 +7121,11 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7103 cache->flags = type; 7121 cache->flags = type;
7104 btrfs_set_block_group_flags(&cache->item, type); 7122 btrfs_set_block_group_flags(&cache->item, type);
7105 7123
7124 cache->cached = 1;
7125 ret = btrfs_add_free_space(cache, chunk_offset, size);
7126 BUG_ON(ret);
7127 remove_sb_from_cache(root, cache);
7128
7106 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7129 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7107 &cache->space_info); 7130 &cache->space_info);
7108 BUG_ON(ret); 7131 BUG_ON(ret);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4538e48581a5..ab8cad8b46c9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -16,45 +16,46 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/pagemap.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/math64.h>
20#include "ctree.h" 22#include "ctree.h"
21#include "free-space-cache.h" 23#include "free-space-cache.h"
22#include "transaction.h" 24#include "transaction.h"
23 25
24struct btrfs_free_space { 26#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
25 struct rb_node bytes_index; 27#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
26 struct rb_node offset_index;
27 u64 offset;
28 u64 bytes;
29};
30 28
31static int tree_insert_offset(struct rb_root *root, u64 offset, 29static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
32 struct rb_node *node) 30 u64 offset)
33{ 31{
34 struct rb_node **p = &root->rb_node; 32 BUG_ON(offset < bitmap_start);
35 struct rb_node *parent = NULL; 33 offset -= bitmap_start;
36 struct btrfs_free_space *info; 34 return (unsigned long)(div64_u64(offset, sectorsize));
35}
37 36
38 while (*p) { 37static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
39 parent = *p; 38{
40 info = rb_entry(parent, struct btrfs_free_space, offset_index); 39 return (unsigned long)(div64_u64(bytes, sectorsize));
40}
41 41
42 if (offset < info->offset) 42static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
43 p = &(*p)->rb_left; 43 u64 offset)
44 else if (offset > info->offset) 44{
45 p = &(*p)->rb_right; 45 u64 bitmap_start;
46 else 46 u64 bytes_per_bitmap;
47 return -EEXIST;
48 }
49 47
50 rb_link_node(node, parent, p); 48 bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
51 rb_insert_color(node, root); 49 bitmap_start = offset - block_group->key.objectid;
50 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
51 bitmap_start *= bytes_per_bitmap;
52 bitmap_start += block_group->key.objectid;
52 53
53 return 0; 54 return bitmap_start;
54} 55}
55 56
56static int tree_insert_bytes(struct rb_root *root, u64 bytes, 57static int tree_insert_offset(struct rb_root *root, u64 offset,
57 struct rb_node *node) 58 struct rb_node *node, int bitmap)
58{ 59{
59 struct rb_node **p = &root->rb_node; 60 struct rb_node **p = &root->rb_node;
60 struct rb_node *parent = NULL; 61 struct rb_node *parent = NULL;
@@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
62 63
63 while (*p) { 64 while (*p) {
64 parent = *p; 65 parent = *p;
65 info = rb_entry(parent, struct btrfs_free_space, bytes_index); 66 info = rb_entry(parent, struct btrfs_free_space, offset_index);
66 67
67 if (bytes < info->bytes) 68 if (offset < info->offset) {
68 p = &(*p)->rb_left; 69 p = &(*p)->rb_left;
69 else 70 } else if (offset > info->offset) {
70 p = &(*p)->rb_right; 71 p = &(*p)->rb_right;
72 } else {
73 /*
74 * we could have a bitmap entry and an extent entry
75 * share the same offset. If this is the case, we want
76 * the extent entry to always be found first if we do a
77 * linear search through the tree, since we want to have
78 * the quickest allocation time, and allocating from an
79 * extent is faster than allocating from a bitmap. So
80 * if we're inserting a bitmap and we find an entry at
81 * this offset, we want to go right, or after this entry
82 * logically. If we are inserting an extent and we've
83 * found a bitmap, we want to go left, or before
84 * logically.
85 */
86 if (bitmap) {
87 WARN_ON(info->bitmap);
88 p = &(*p)->rb_right;
89 } else {
90 WARN_ON(!info->bitmap);
91 p = &(*p)->rb_left;
92 }
93 }
71 } 94 }
72 95
73 rb_link_node(node, parent, p); 96 rb_link_node(node, parent, p);
@@ -79,110 +102,142 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
79/* 102/*
80 * searches the tree for the given offset. 103 * searches the tree for the given offset.
81 * 104 *
82 * fuzzy == 1: this is used for allocations where we are given a hint of where 105 * fuzzy - If this is set, then we are trying to make an allocation, and we just
83 * to look for free space. Because the hint may not be completely on an offset 106 * want a section that has at least bytes size and comes at or after the given
84 * mark, or the hint may no longer point to free space we need to fudge our 107 * offset.
85 * results a bit. So we look for free space starting at or after offset with at
86 * least bytes size. We prefer to find as close to the given offset as we can.
87 * Also if the offset is within a free space range, then we will return the free
88 * space that contains the given offset, which means we can return a free space
89 * chunk with an offset before the provided offset.
90 *
91 * fuzzy == 0: this is just a normal tree search. Give us the free space that
92 * starts at the given offset which is at least bytes size, and if its not there
93 * return NULL.
94 */ 108 */
95static struct btrfs_free_space *tree_search_offset(struct rb_root *root, 109static struct btrfs_free_space *
96 u64 offset, u64 bytes, 110tree_search_offset(struct btrfs_block_group_cache *block_group,
97 int fuzzy) 111 u64 offset, int bitmap_only, int fuzzy)
98{ 112{
99 struct rb_node *n = root->rb_node; 113 struct rb_node *n = block_group->free_space_offset.rb_node;
100 struct btrfs_free_space *entry, *ret = NULL; 114 struct btrfs_free_space *entry, *prev = NULL;
115
116 /* find entry that is closest to the 'offset' */
117 while (1) {
118 if (!n) {
119 entry = NULL;
120 break;
121 }
101 122
102 while (n) {
103 entry = rb_entry(n, struct btrfs_free_space, offset_index); 123 entry = rb_entry(n, struct btrfs_free_space, offset_index);
124 prev = entry;
104 125
105 if (offset < entry->offset) { 126 if (offset < entry->offset)
106 if (fuzzy &&
107 (!ret || entry->offset < ret->offset) &&
108 (bytes <= entry->bytes))
109 ret = entry;
110 n = n->rb_left; 127 n = n->rb_left;
111 } else if (offset > entry->offset) { 128 else if (offset > entry->offset)
112 if (fuzzy &&
113 (entry->offset + entry->bytes - 1) >= offset &&
114 bytes <= entry->bytes) {
115 ret = entry;
116 break;
117 }
118 n = n->rb_right; 129 n = n->rb_right;
119 } else { 130 else
120 if (bytes > entry->bytes) {
121 n = n->rb_right;
122 continue;
123 }
124 ret = entry;
125 break; 131 break;
126 }
127 } 132 }
128 133
129 return ret; 134 if (bitmap_only) {
130} 135 if (!entry)
131 136 return NULL;
132/* 137 if (entry->bitmap)
133 * return a chunk at least bytes size, as close to offset that we can get. 138 return entry;
134 */
135static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
136 u64 offset, u64 bytes)
137{
138 struct rb_node *n = root->rb_node;
139 struct btrfs_free_space *entry, *ret = NULL;
140 139
141 while (n) { 140 /*
142 entry = rb_entry(n, struct btrfs_free_space, bytes_index); 141 * bitmap entry and extent entry may share same offset,
142 * in that case, bitmap entry comes after extent entry.
143 */
144 n = rb_next(n);
145 if (!n)
146 return NULL;
147 entry = rb_entry(n, struct btrfs_free_space, offset_index);
148 if (entry->offset != offset)
149 return NULL;
143 150
144 if (bytes < entry->bytes) { 151 WARN_ON(!entry->bitmap);
152 return entry;
153 } else if (entry) {
154 if (entry->bitmap) {
145 /* 155 /*
146 * We prefer to get a hole size as close to the size we 156 * if previous extent entry covers the offset,
147 * are asking for so we don't take small slivers out of 157 * we should return it instead of the bitmap entry
148 * huge holes, but we also want to get as close to the
149 * offset as possible so we don't have a whole lot of
150 * fragmentation.
151 */ 158 */
152 if (offset <= entry->offset) { 159 n = &entry->offset_index;
153 if (!ret) 160 while (1) {
154 ret = entry; 161 n = rb_prev(n);
155 else if (entry->bytes < ret->bytes) 162 if (!n)
156 ret = entry; 163 break;
157 else if (entry->offset < ret->offset) 164 prev = rb_entry(n, struct btrfs_free_space,
158 ret = entry; 165 offset_index);
166 if (!prev->bitmap) {
167 if (prev->offset + prev->bytes > offset)
168 entry = prev;
169 break;
170 }
159 } 171 }
160 n = n->rb_left; 172 }
161 } else if (bytes > entry->bytes) { 173 return entry;
162 n = n->rb_right; 174 }
175
176 if (!prev)
177 return NULL;
178
179 /* find last entry before the 'offset' */
180 entry = prev;
181 if (entry->offset > offset) {
182 n = rb_prev(&entry->offset_index);
183 if (n) {
184 entry = rb_entry(n, struct btrfs_free_space,
185 offset_index);
186 BUG_ON(entry->offset > offset);
163 } else { 187 } else {
164 /* 188 if (fuzzy)
165 * Ok we may have multiple chunks of the wanted size, 189 return entry;
166 * so we don't want to take the first one we find, we 190 else
167 * want to take the one closest to our given offset, so 191 return NULL;
168 * keep searching just in case theres a better match.
169 */
170 n = n->rb_right;
171 if (offset > entry->offset)
172 continue;
173 else if (!ret || entry->offset < ret->offset)
174 ret = entry;
175 } 192 }
176 } 193 }
177 194
178 return ret; 195 if (entry->bitmap) {
196 n = &entry->offset_index;
197 while (1) {
198 n = rb_prev(n);
199 if (!n)
200 break;
201 prev = rb_entry(n, struct btrfs_free_space,
202 offset_index);
203 if (!prev->bitmap) {
204 if (prev->offset + prev->bytes > offset)
205 return prev;
206 break;
207 }
208 }
209 if (entry->offset + BITS_PER_BITMAP *
210 block_group->sectorsize > offset)
211 return entry;
212 } else if (entry->offset + entry->bytes > offset)
213 return entry;
214
215 if (!fuzzy)
216 return NULL;
217
218 while (1) {
219 if (entry->bitmap) {
220 if (entry->offset + BITS_PER_BITMAP *
221 block_group->sectorsize > offset)
222 break;
223 } else {
224 if (entry->offset + entry->bytes > offset)
225 break;
226 }
227
228 n = rb_next(&entry->offset_index);
229 if (!n)
230 return NULL;
231 entry = rb_entry(n, struct btrfs_free_space, offset_index);
232 }
233 return entry;
179} 234}
180 235
181static void unlink_free_space(struct btrfs_block_group_cache *block_group, 236static void unlink_free_space(struct btrfs_block_group_cache *block_group,
182 struct btrfs_free_space *info) 237 struct btrfs_free_space *info)
183{ 238{
184 rb_erase(&info->offset_index, &block_group->free_space_offset); 239 rb_erase(&info->offset_index, &block_group->free_space_offset);
185 rb_erase(&info->bytes_index, &block_group->free_space_bytes); 240 block_group->free_extents--;
186} 241}
187 242
188static int link_free_space(struct btrfs_block_group_cache *block_group, 243static int link_free_space(struct btrfs_block_group_cache *block_group,
@@ -190,17 +245,311 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
190{ 245{
191 int ret = 0; 246 int ret = 0;
192 247
193 248 BUG_ON(!info->bitmap && !info->bytes);
194 BUG_ON(!info->bytes);
195 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 249 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
196 &info->offset_index); 250 &info->offset_index, (info->bitmap != NULL));
197 if (ret) 251 if (ret)
198 return ret; 252 return ret;
199 253
200 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, 254 block_group->free_extents++;
201 &info->bytes_index); 255 return ret;
202 if (ret) 256}
203 return ret; 257
258static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
259{
260 u64 max_bytes, possible_bytes;
261
262 /*
263 * The goal is to keep the total amount of memory used per 1gb of space
264 * at or below 32k, so we need to adjust how much memory we allow to be
265 * used by extent based free space tracking
266 */
267 max_bytes = MAX_CACHE_BYTES_PER_GIG *
268 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
269
270 possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
271 (sizeof(struct btrfs_free_space) *
272 block_group->extents_thresh);
273
274 if (possible_bytes > max_bytes) {
275 int extent_bytes = max_bytes -
276 (block_group->total_bitmaps * PAGE_CACHE_SIZE);
277
278 if (extent_bytes <= 0) {
279 block_group->extents_thresh = 0;
280 return;
281 }
282
283 block_group->extents_thresh = extent_bytes /
284 (sizeof(struct btrfs_free_space));
285 }
286}
287
288static void bitmap_clear_bits(struct btrfs_free_space *info, u64 offset, u64 bytes,
289 u64 sectorsize)
290{
291 unsigned long start, end;
292 unsigned long i;
293
294 start = offset_to_bit(info->offset, sectorsize, offset);
295 end = start + bytes_to_bits(bytes, sectorsize);
296 BUG_ON(end > BITS_PER_BITMAP);
297
298 for (i = start; i < end; i++)
299 clear_bit(i, info->bitmap);
300
301 info->bytes -= bytes;
302}
303
304static void bitmap_set_bits(struct btrfs_free_space *info, u64 offset, u64 bytes,
305 u64 sectorsize)
306{
307 unsigned long start, end;
308 unsigned long i;
309
310 start = offset_to_bit(info->offset, sectorsize, offset);
311 end = start + bytes_to_bits(bytes, sectorsize);
312 BUG_ON(end > BITS_PER_BITMAP);
313
314 for (i = start; i < end; i++)
315 set_bit(i, info->bitmap);
316
317 info->bytes += bytes;
318}
319
320static int search_bitmap(struct btrfs_block_group_cache *block_group,
321 struct btrfs_free_space *bitmap_info, u64 *offset,
322 u64 *bytes)
323{
324 unsigned long found_bits = 0;
325 unsigned long bits, i;
326 unsigned long next_zero;
327
328 i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
329 max_t(u64, *offset, bitmap_info->offset));
330 bits = bytes_to_bits(*bytes, block_group->sectorsize);
331
332 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
333 i < BITS_PER_BITMAP;
334 i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
335 next_zero = find_next_zero_bit(bitmap_info->bitmap,
336 BITS_PER_BITMAP, i);
337 if ((next_zero - i) >= bits) {
338 found_bits = next_zero - i;
339 break;
340 }
341 i = next_zero;
342 }
343
344 if (found_bits) {
345 *offset = (u64)(i * block_group->sectorsize) +
346 bitmap_info->offset;
347 *bytes = (u64)(found_bits) * block_group->sectorsize;
348 return 0;
349 }
350
351 return -1;
352}
353
354static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
355 *block_group, u64 *offset,
356 u64 *bytes, int debug)
357{
358 struct btrfs_free_space *entry;
359 struct rb_node *node;
360 int ret;
361
362 if (!block_group->free_space_offset.rb_node)
363 return NULL;
364
365 entry = tree_search_offset(block_group,
366 offset_to_bitmap(block_group, *offset),
367 0, 1);
368 if (!entry)
369 return NULL;
370
371 for (node = &entry->offset_index; node; node = rb_next(node)) {
372 entry = rb_entry(node, struct btrfs_free_space, offset_index);
373 if (entry->bytes < *bytes)
374 continue;
375
376 if (entry->bitmap) {
377 ret = search_bitmap(block_group, entry, offset, bytes);
378 if (!ret)
379 return entry;
380 continue;
381 }
382
383 *offset = entry->offset;
384 *bytes = entry->bytes;
385 return entry;
386 }
387
388 return NULL;
389}
390
391static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
392 struct btrfs_free_space *info, u64 offset)
393{
394 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
395 int max_bitmaps = (int)div64_u64(block_group->key.offset +
396 bytes_per_bg - 1, bytes_per_bg);
397 BUG_ON(block_group->total_bitmaps >= max_bitmaps);
398
399 info->offset = offset_to_bitmap(block_group, offset);
400 link_free_space(block_group, info);
401 block_group->total_bitmaps++;
402
403 recalculate_thresholds(block_group);
404}
405
406static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
407 struct btrfs_free_space *bitmap_info,
408 u64 *offset, u64 *bytes)
409{
410 u64 end;
411
412again:
413 end = bitmap_info->offset +
414 (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
415
416 if (*offset > bitmap_info->offset && *offset + *bytes > end) {
417 bitmap_clear_bits(bitmap_info, *offset,
418 end - *offset + 1, block_group->sectorsize);
419 *bytes -= end - *offset + 1;
420 *offset = end + 1;
421 } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
422 bitmap_clear_bits(bitmap_info, *offset,
423 *bytes, block_group->sectorsize);
424 *bytes = 0;
425 }
426
427 if (*bytes) {
428 if (!bitmap_info->bytes) {
429 unlink_free_space(block_group, bitmap_info);
430 kfree(bitmap_info->bitmap);
431 kfree(bitmap_info);
432 block_group->total_bitmaps--;
433 recalculate_thresholds(block_group);
434 }
435
436 bitmap_info = tree_search_offset(block_group,
437 offset_to_bitmap(block_group,
438 *offset),
439 1, 0);
440 if (!bitmap_info)
441 return -EINVAL;
442
443 if (!bitmap_info->bitmap)
444 return -EAGAIN;
445
446 goto again;
447 } else if (!bitmap_info->bytes) {
448 unlink_free_space(block_group, bitmap_info);
449 kfree(bitmap_info->bitmap);
450 kfree(bitmap_info);
451 block_group->total_bitmaps--;
452 recalculate_thresholds(block_group);
453 }
454
455 return 0;
456}
457
458static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
459 struct btrfs_free_space *info)
460{
461 struct btrfs_free_space *bitmap_info;
462 int added = 0;
463 u64 bytes, offset, end;
464 int ret;
465
466 /*
467 * If we are below the extents threshold then we can add this as an
468 * extent, and don't have to deal with the bitmap
469 */
470 if (block_group->free_extents < block_group->extents_thresh &&
471 info->bytes > block_group->sectorsize * 4)
472 return 0;
473
474 /*
475 * some block groups are so tiny they can't be enveloped by a bitmap, so
476 * don't even bother to create a bitmap for this
477 */
478 if (BITS_PER_BITMAP * block_group->sectorsize >
479 block_group->key.offset)
480 return 0;
481
482 bytes = info->bytes;
483 offset = info->offset;
484
485again:
486 bitmap_info = tree_search_offset(block_group,
487 offset_to_bitmap(block_group, offset),
488 1, 0);
489 if (!bitmap_info) {
490 BUG_ON(added);
491 goto new_bitmap;
492 }
493
494 end = bitmap_info->offset +
495 (u64)(BITS_PER_BITMAP * block_group->sectorsize);
496
497 if (offset >= bitmap_info->offset && offset + bytes > end) {
498 bitmap_set_bits(bitmap_info, offset, end - offset,
499 block_group->sectorsize);
500 bytes -= end - offset;
501 offset = end;
502 added = 0;
503 } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
504 bitmap_set_bits(bitmap_info, offset, bytes,
505 block_group->sectorsize);
506 bytes = 0;
507 } else {
508 BUG();
509 }
510
511 if (!bytes) {
512 ret = 1;
513 goto out;
514 } else
515 goto again;
516
517new_bitmap:
518 if (info && info->bitmap) {
519 add_new_bitmap(block_group, info, offset);
520 added = 1;
521 info = NULL;
522 goto again;
523 } else {
524 spin_unlock(&block_group->tree_lock);
525
526 /* no pre-allocated info, allocate a new one */
527 if (!info) {
528 info = kzalloc(sizeof(struct btrfs_free_space),
529 GFP_NOFS);
530 if (!info) {
531 spin_lock(&block_group->tree_lock);
532 ret = -ENOMEM;
533 goto out;
534 }
535 }
536
537 /* allocate the bitmap */
538 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
539 spin_lock(&block_group->tree_lock);
540 if (!info->bitmap) {
541 ret = -ENOMEM;
542 goto out;
543 }
544 goto again;
545 }
546
547out:
548 if (info) {
549 if (info->bitmap)
550 kfree(info->bitmap);
551 kfree(info);
552 }
204 553
205 return ret; 554 return ret;
206} 555}
@@ -208,8 +557,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
208int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 557int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
209 u64 offset, u64 bytes) 558 u64 offset, u64 bytes)
210{ 559{
211 struct btrfs_free_space *right_info; 560 struct btrfs_free_space *right_info = NULL;
212 struct btrfs_free_space *left_info; 561 struct btrfs_free_space *left_info = NULL;
213 struct btrfs_free_space *info = NULL; 562 struct btrfs_free_space *info = NULL;
214 int ret = 0; 563 int ret = 0;
215 564
@@ -227,18 +576,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
227 * are adding, if there is remove that struct and add a new one to 576 * are adding, if there is remove that struct and add a new one to
228 * cover the entire range 577 * cover the entire range
229 */ 578 */
230 right_info = tree_search_offset(&block_group->free_space_offset, 579 right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
231 offset+bytes, 0, 0); 580 if (right_info && rb_prev(&right_info->offset_index))
232 left_info = tree_search_offset(&block_group->free_space_offset, 581 left_info = rb_entry(rb_prev(&right_info->offset_index),
233 offset-1, 0, 1); 582 struct btrfs_free_space, offset_index);
583 else
584 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
234 585
235 if (right_info) { 586 /*
587 * If there was no extent directly to the left or right of this new
588 * extent then we know we're going to have to allocate a new extent, so
589 * before we do that see if we need to drop this into a bitmap
590 */
591 if ((!left_info || left_info->bitmap) &&
592 (!right_info || right_info->bitmap)) {
593 ret = insert_into_bitmap(block_group, info);
594
595 if (ret < 0) {
596 goto out;
597 } else if (ret) {
598 ret = 0;
599 goto out;
600 }
601 }
602
603 if (right_info && !right_info->bitmap) {
236 unlink_free_space(block_group, right_info); 604 unlink_free_space(block_group, right_info);
237 info->bytes += right_info->bytes; 605 info->bytes += right_info->bytes;
238 kfree(right_info); 606 kfree(right_info);
239 } 607 }
240 608
241 if (left_info && left_info->offset + left_info->bytes == offset) { 609 if (left_info && !left_info->bitmap &&
610 left_info->offset + left_info->bytes == offset) {
242 unlink_free_space(block_group, left_info); 611 unlink_free_space(block_group, left_info);
243 info->offset = left_info->offset; 612 info->offset = left_info->offset;
244 info->bytes += left_info->bytes; 613 info->bytes += left_info->bytes;
@@ -248,11 +617,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
248 ret = link_free_space(block_group, info); 617 ret = link_free_space(block_group, info);
249 if (ret) 618 if (ret)
250 kfree(info); 619 kfree(info);
251 620out:
252 spin_unlock(&block_group->tree_lock); 621 spin_unlock(&block_group->tree_lock);
253 622
254 if (ret) { 623 if (ret) {
255 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); 624 printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
256 BUG_ON(ret == -EEXIST); 625 BUG_ON(ret == -EEXIST);
257 } 626 }
258 627
@@ -263,40 +632,65 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
263 u64 offset, u64 bytes) 632 u64 offset, u64 bytes)
264{ 633{
265 struct btrfs_free_space *info; 634 struct btrfs_free_space *info;
635 struct btrfs_free_space *next_info = NULL;
266 int ret = 0; 636 int ret = 0;
267 637
268 spin_lock(&block_group->tree_lock); 638 spin_lock(&block_group->tree_lock);
269 639
270 info = tree_search_offset(&block_group->free_space_offset, offset, 0, 640again:
271 1); 641 info = tree_search_offset(block_group, offset, 0, 0);
272 if (info && info->offset == offset) { 642 if (!info) {
273 if (info->bytes < bytes) { 643 WARN_ON(1);
274 printk(KERN_ERR "Found free space at %llu, size %llu," 644 goto out_lock;
275 "trying to use %llu\n", 645 }
276 (unsigned long long)info->offset, 646
277 (unsigned long long)info->bytes, 647 if (info->bytes < bytes && rb_next(&info->offset_index)) {
278 (unsigned long long)bytes); 648 u64 end;
649 next_info = rb_entry(rb_next(&info->offset_index),
650 struct btrfs_free_space,
651 offset_index);
652
653 if (next_info->bitmap)
654 end = next_info->offset + BITS_PER_BITMAP *
655 block_group->sectorsize - 1;
656 else
657 end = next_info->offset + next_info->bytes;
658
659 if (next_info->bytes < bytes ||
660 next_info->offset > offset || offset > end) {
661 printk(KERN_CRIT "Found free space at %llu, size %llu,"
662 " trying to use %llu\n",
663 (unsigned long long)info->offset,
664 (unsigned long long)info->bytes,
665 (unsigned long long)bytes);
279 WARN_ON(1); 666 WARN_ON(1);
280 ret = -EINVAL; 667 ret = -EINVAL;
281 spin_unlock(&block_group->tree_lock); 668 goto out_lock;
282 goto out;
283 } 669 }
284 unlink_free_space(block_group, info);
285 670
286 if (info->bytes == bytes) { 671 info = next_info;
287 kfree(info); 672 }
288 spin_unlock(&block_group->tree_lock); 673
289 goto out; 674 if (info->bytes == bytes) {
675 unlink_free_space(block_group, info);
676 if (info->bitmap) {
677 kfree(info->bitmap);
678 block_group->total_bitmaps--;
290 } 679 }
680 kfree(info);
681 goto out_lock;
682 }
291 683
684 if (!info->bitmap && info->offset == offset) {
685 unlink_free_space(block_group, info);
292 info->offset += bytes; 686 info->offset += bytes;
293 info->bytes -= bytes; 687 info->bytes -= bytes;
688 link_free_space(block_group, info);
689 goto out_lock;
690 }
294 691
295 ret = link_free_space(block_group, info); 692 if (!info->bitmap && info->offset <= offset &&
296 spin_unlock(&block_group->tree_lock); 693 info->offset + info->bytes >= offset + bytes) {
297 BUG_ON(ret);
298 } else if (info && info->offset < offset &&
299 info->offset + info->bytes >= offset + bytes) {
300 u64 old_start = info->offset; 694 u64 old_start = info->offset;
301 /* 695 /*
302 * we're freeing space in the middle of the info, 696 * we're freeing space in the middle of the info,
@@ -312,7 +706,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
312 info->offset = offset + bytes; 706 info->offset = offset + bytes;
313 info->bytes = old_end - info->offset; 707 info->bytes = old_end - info->offset;
314 ret = link_free_space(block_group, info); 708 ret = link_free_space(block_group, info);
315 BUG_ON(ret); 709 WARN_ON(ret);
710 if (ret)
711 goto out_lock;
316 } else { 712 } else {
317 /* the hole we're creating ends at the end 713 /* the hole we're creating ends at the end
318 * of the info struct, just free the info 714 * of the info struct, just free the info
@@ -320,32 +716,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
320 kfree(info); 716 kfree(info);
321 } 717 }
322 spin_unlock(&block_group->tree_lock); 718 spin_unlock(&block_group->tree_lock);
323 /* step two, insert a new info struct to cover anything 719
324 * before the hole 720 /* step two, insert a new info struct to cover
721 * anything before the hole
325 */ 722 */
326 ret = btrfs_add_free_space(block_group, old_start, 723 ret = btrfs_add_free_space(block_group, old_start,
327 offset - old_start); 724 offset - old_start);
328 BUG_ON(ret); 725 WARN_ON(ret);
329 } else { 726 goto out;
330 spin_unlock(&block_group->tree_lock);
331 if (!info) {
332 printk(KERN_ERR "couldn't find space %llu to free\n",
333 (unsigned long long)offset);
334 printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
335 block_group->cached,
336 (unsigned long long)block_group->key.objectid,
337 (unsigned long long)block_group->key.offset);
338 btrfs_dump_free_space(block_group, bytes);
339 } else if (info) {
340 printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
341 "but wanted offset=%llu bytes=%llu\n",
342 (unsigned long long)info->offset,
343 (unsigned long long)info->bytes,
344 (unsigned long long)offset,
345 (unsigned long long)bytes);
346 }
347 WARN_ON(1);
348 } 727 }
728
729 ret = remove_from_bitmap(block_group, info, &offset, &bytes);
730 if (ret == -EAGAIN)
731 goto again;
732 BUG_ON(ret);
733out_lock:
734 spin_unlock(&block_group->tree_lock);
349out: 735out:
350 return ret; 736 return ret;
351} 737}
@@ -361,10 +747,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
361 info = rb_entry(n, struct btrfs_free_space, offset_index); 747 info = rb_entry(n, struct btrfs_free_space, offset_index);
362 if (info->bytes >= bytes) 748 if (info->bytes >= bytes)
363 count++; 749 count++;
364 printk(KERN_ERR "entry offset %llu, bytes %llu\n", 750 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
365 (unsigned long long)info->offset, 751 (unsigned long long)info->offset,
366 (unsigned long long)info->bytes); 752 (unsigned long long)info->bytes,
753 (info->bitmap) ? "yes" : "no");
367 } 754 }
755 printk(KERN_INFO "block group has cluster?: %s\n",
756 list_empty(&block_group->cluster_list) ? "no" : "yes");
368 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 757 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
369 "\n", count); 758 "\n", count);
370} 759}
@@ -397,26 +786,35 @@ __btrfs_return_cluster_to_free_space(
397{ 786{
398 struct btrfs_free_space *entry; 787 struct btrfs_free_space *entry;
399 struct rb_node *node; 788 struct rb_node *node;
789 bool bitmap;
400 790
401 spin_lock(&cluster->lock); 791 spin_lock(&cluster->lock);
402 if (cluster->block_group != block_group) 792 if (cluster->block_group != block_group)
403 goto out; 793 goto out;
404 794
795 bitmap = cluster->points_to_bitmap;
796 cluster->block_group = NULL;
405 cluster->window_start = 0; 797 cluster->window_start = 0;
798 list_del_init(&cluster->block_group_list);
799 cluster->points_to_bitmap = false;
800
801 if (bitmap)
802 goto out;
803
406 node = rb_first(&cluster->root); 804 node = rb_first(&cluster->root);
407 while(node) { 805 while (node) {
408 entry = rb_entry(node, struct btrfs_free_space, offset_index); 806 entry = rb_entry(node, struct btrfs_free_space, offset_index);
409 node = rb_next(&entry->offset_index); 807 node = rb_next(&entry->offset_index);
410 rb_erase(&entry->offset_index, &cluster->root); 808 rb_erase(&entry->offset_index, &cluster->root);
411 link_free_space(block_group, entry); 809 BUG_ON(entry->bitmap);
810 tree_insert_offset(&block_group->free_space_offset,
811 entry->offset, &entry->offset_index, 0);
412 } 812 }
413 list_del_init(&cluster->block_group_list);
414
415 btrfs_put_block_group(cluster->block_group);
416 cluster->block_group = NULL;
417 cluster->root.rb_node = NULL; 813 cluster->root.rb_node = NULL;
814
418out: 815out:
419 spin_unlock(&cluster->lock); 816 spin_unlock(&cluster->lock);
817 btrfs_put_block_group(block_group);
420 return 0; 818 return 0;
421} 819}
422 820
@@ -425,20 +823,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
425 struct btrfs_free_space *info; 823 struct btrfs_free_space *info;
426 struct rb_node *node; 824 struct rb_node *node;
427 struct btrfs_free_cluster *cluster; 825 struct btrfs_free_cluster *cluster;
428 struct btrfs_free_cluster *safe; 826 struct list_head *head;
429 827
430 spin_lock(&block_group->tree_lock); 828 spin_lock(&block_group->tree_lock);
431 829 while ((head = block_group->cluster_list.next) !=
432 list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, 830 &block_group->cluster_list) {
433 block_group_list) { 831 cluster = list_entry(head, struct btrfs_free_cluster,
832 block_group_list);
434 833
435 WARN_ON(cluster->block_group != block_group); 834 WARN_ON(cluster->block_group != block_group);
436 __btrfs_return_cluster_to_free_space(block_group, cluster); 835 __btrfs_return_cluster_to_free_space(block_group, cluster);
836 if (need_resched()) {
837 spin_unlock(&block_group->tree_lock);
838 cond_resched();
839 spin_lock(&block_group->tree_lock);
840 }
437 } 841 }
438 842
439 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { 843 while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
440 info = rb_entry(node, struct btrfs_free_space, bytes_index); 844 info = rb_entry(node, struct btrfs_free_space, offset_index);
441 unlink_free_space(block_group, info); 845 unlink_free_space(block_group, info);
846 if (info->bitmap)
847 kfree(info->bitmap);
442 kfree(info); 848 kfree(info);
443 if (need_resched()) { 849 if (need_resched()) {
444 spin_unlock(&block_group->tree_lock); 850 spin_unlock(&block_group->tree_lock);
@@ -446,6 +852,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
446 spin_lock(&block_group->tree_lock); 852 spin_lock(&block_group->tree_lock);
447 } 853 }
448 } 854 }
855
449 spin_unlock(&block_group->tree_lock); 856 spin_unlock(&block_group->tree_lock);
450} 857}
451 858
@@ -453,27 +860,37 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
453 u64 offset, u64 bytes, u64 empty_size) 860 u64 offset, u64 bytes, u64 empty_size)
454{ 861{
455 struct btrfs_free_space *entry = NULL; 862 struct btrfs_free_space *entry = NULL;
863 u64 bytes_search = bytes + empty_size;
456 u64 ret = 0; 864 u64 ret = 0;
457 865
458 spin_lock(&block_group->tree_lock); 866 spin_lock(&block_group->tree_lock);
459 entry = tree_search_offset(&block_group->free_space_offset, offset, 867 entry = find_free_space(block_group, &offset, &bytes_search, 0);
460 bytes + empty_size, 1);
461 if (!entry) 868 if (!entry)
462 entry = tree_search_bytes(&block_group->free_space_bytes, 869 goto out;
463 offset, bytes + empty_size); 870
464 if (entry) { 871 ret = offset;
872 if (entry->bitmap) {
873 bitmap_clear_bits(entry, offset, bytes,
874 block_group->sectorsize);
875 if (!entry->bytes) {
876 unlink_free_space(block_group, entry);
877 kfree(entry->bitmap);
878 kfree(entry);
879 block_group->total_bitmaps--;
880 recalculate_thresholds(block_group);
881 }
882 } else {
465 unlink_free_space(block_group, entry); 883 unlink_free_space(block_group, entry);
466 ret = entry->offset;
467 entry->offset += bytes; 884 entry->offset += bytes;
468 entry->bytes -= bytes; 885 entry->bytes -= bytes;
469
470 if (!entry->bytes) 886 if (!entry->bytes)
471 kfree(entry); 887 kfree(entry);
472 else 888 else
473 link_free_space(block_group, entry); 889 link_free_space(block_group, entry);
474 } 890 }
475 spin_unlock(&block_group->tree_lock);
476 891
892out:
893 spin_unlock(&block_group->tree_lock);
477 return ret; 894 return ret;
478} 895}
479 896
@@ -517,6 +934,47 @@ int btrfs_return_cluster_to_free_space(
517 return ret; 934 return ret;
518} 935}
519 936
937static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
938 struct btrfs_free_cluster *cluster,
939 u64 bytes, u64 min_start)
940{
941 struct btrfs_free_space *entry;
942 int err;
943 u64 search_start = cluster->window_start;
944 u64 search_bytes = bytes;
945 u64 ret = 0;
946
947 spin_lock(&block_group->tree_lock);
948 spin_lock(&cluster->lock);
949
950 if (!cluster->points_to_bitmap)
951 goto out;
952
953 if (cluster->block_group != block_group)
954 goto out;
955
956 entry = tree_search_offset(block_group, search_start, 0, 0);
957
958 if (!entry || !entry->bitmap)
959 goto out;
960
961 search_start = min_start;
962 search_bytes = bytes;
963
964 err = search_bitmap(block_group, entry, &search_start,
965 &search_bytes);
966 if (err)
967 goto out;
968
969 ret = search_start;
970 bitmap_clear_bits(entry, ret, bytes, block_group->sectorsize);
971out:
972 spin_unlock(&cluster->lock);
973 spin_unlock(&block_group->tree_lock);
974
975 return ret;
976}
977
520/* 978/*
521 * given a cluster, try to allocate 'bytes' from it, returns 0 979 * given a cluster, try to allocate 'bytes' from it, returns 0
522 * if it couldn't find anything suitably large, or a logical disk offset 980 * if it couldn't find anything suitably large, or a logical disk offset
@@ -530,6 +988,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
530 struct rb_node *node; 988 struct rb_node *node;
531 u64 ret = 0; 989 u64 ret = 0;
532 990
991 if (cluster->points_to_bitmap)
992 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
993 min_start);
994
533 spin_lock(&cluster->lock); 995 spin_lock(&cluster->lock);
534 if (bytes > cluster->max_size) 996 if (bytes > cluster->max_size)
535 goto out; 997 goto out;
@@ -567,9 +1029,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
567 } 1029 }
568out: 1030out:
569 spin_unlock(&cluster->lock); 1031 spin_unlock(&cluster->lock);
1032
570 return ret; 1033 return ret;
571} 1034}
572 1035
1036static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1037 struct btrfs_free_space *entry,
1038 struct btrfs_free_cluster *cluster,
1039 u64 offset, u64 bytes, u64 min_bytes)
1040{
1041 unsigned long next_zero;
1042 unsigned long i;
1043 unsigned long search_bits;
1044 unsigned long total_bits;
1045 unsigned long found_bits;
1046 unsigned long start = 0;
1047 unsigned long total_found = 0;
1048 bool found = false;
1049
1050 i = offset_to_bit(entry->offset, block_group->sectorsize,
1051 max_t(u64, offset, entry->offset));
1052 search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1053 total_bits = bytes_to_bits(bytes, block_group->sectorsize);
1054
1055again:
1056 found_bits = 0;
1057 for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
1058 i < BITS_PER_BITMAP;
1059 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
1060 next_zero = find_next_zero_bit(entry->bitmap,
1061 BITS_PER_BITMAP, i);
1062 if (next_zero - i >= search_bits) {
1063 found_bits = next_zero - i;
1064 break;
1065 }
1066 i = next_zero;
1067 }
1068
1069 if (!found_bits)
1070 return -1;
1071
1072 if (!found) {
1073 start = i;
1074 found = true;
1075 }
1076
1077 total_found += found_bits;
1078
1079 if (cluster->max_size < found_bits * block_group->sectorsize)
1080 cluster->max_size = found_bits * block_group->sectorsize;
1081
1082 if (total_found < total_bits) {
1083 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
1084 if (i - start > total_bits * 2) {
1085 total_found = 0;
1086 cluster->max_size = 0;
1087 found = false;
1088 }
1089 goto again;
1090 }
1091
1092 cluster->window_start = start * block_group->sectorsize +
1093 entry->offset;
1094 cluster->points_to_bitmap = true;
1095
1096 return 0;
1097}
1098
573/* 1099/*
574 * here we try to find a cluster of blocks in a block group. The goal 1100 * here we try to find a cluster of blocks in a block group. The goal
575 * is to find at least bytes free and up to empty_size + bytes free. 1101 * is to find at least bytes free and up to empty_size + bytes free.
@@ -587,12 +1113,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
587 struct btrfs_free_space *entry = NULL; 1113 struct btrfs_free_space *entry = NULL;
588 struct rb_node *node; 1114 struct rb_node *node;
589 struct btrfs_free_space *next; 1115 struct btrfs_free_space *next;
590 struct btrfs_free_space *last; 1116 struct btrfs_free_space *last = NULL;
591 u64 min_bytes; 1117 u64 min_bytes;
592 u64 window_start; 1118 u64 window_start;
593 u64 window_free; 1119 u64 window_free;
594 u64 max_extent = 0; 1120 u64 max_extent = 0;
595 int total_retries = 0; 1121 bool found_bitmap = false;
596 int ret; 1122 int ret;
597 1123
598 /* for metadata, allow allocates with more holes */ 1124 /* for metadata, allow allocates with more holes */
@@ -620,31 +1146,80 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
620 goto out; 1146 goto out;
621 } 1147 }
622again: 1148again:
623 min_bytes = min(min_bytes, bytes + empty_size); 1149 entry = tree_search_offset(block_group, offset, found_bitmap, 1);
624 entry = tree_search_bytes(&block_group->free_space_bytes,
625 offset, min_bytes);
626 if (!entry) { 1150 if (!entry) {
627 ret = -ENOSPC; 1151 ret = -ENOSPC;
628 goto out; 1152 goto out;
629 } 1153 }
1154
1155 /*
1156 * If found_bitmap is true, we exhausted our search for extent entries,
1157 * and we just want to search all of the bitmaps that we can find, and
1158 * ignore any extent entries we find.
1159 */
1160 while (entry->bitmap || found_bitmap ||
1161 (!entry->bitmap && entry->bytes < min_bytes)) {
1162 struct rb_node *node = rb_next(&entry->offset_index);
1163
1164 if (entry->bitmap && entry->bytes > bytes + empty_size) {
1165 ret = btrfs_bitmap_cluster(block_group, entry, cluster,
1166 offset, bytes + empty_size,
1167 min_bytes);
1168 if (!ret)
1169 goto got_it;
1170 }
1171
1172 if (!node) {
1173 ret = -ENOSPC;
1174 goto out;
1175 }
1176 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1177 }
1178
1179 /*
1180 * We already searched all the extent entries from the passed in offset
1181 * to the end and didn't find enough space for the cluster, and we also
1182 * didn't find any bitmaps that met our criteria, just go ahead and exit
1183 */
1184 if (found_bitmap) {
1185 ret = -ENOSPC;
1186 goto out;
1187 }
1188
1189 cluster->points_to_bitmap = false;
630 window_start = entry->offset; 1190 window_start = entry->offset;
631 window_free = entry->bytes; 1191 window_free = entry->bytes;
632 last = entry; 1192 last = entry;
633 max_extent = entry->bytes; 1193 max_extent = entry->bytes;
634 1194
635 while(1) { 1195 while (1) {
636 /* out window is just right, lets fill it */ 1196 /* out window is just right, lets fill it */
637 if (window_free >= bytes + empty_size) 1197 if (window_free >= bytes + empty_size)
638 break; 1198 break;
639 1199
640 node = rb_next(&last->offset_index); 1200 node = rb_next(&last->offset_index);
641 if (!node) { 1201 if (!node) {
1202 if (found_bitmap)
1203 goto again;
642 ret = -ENOSPC; 1204 ret = -ENOSPC;
643 goto out; 1205 goto out;
644 } 1206 }
645 next = rb_entry(node, struct btrfs_free_space, offset_index); 1207 next = rb_entry(node, struct btrfs_free_space, offset_index);
646 1208
647 /* 1209 /*
1210 * we found a bitmap, so if this search doesn't result in a
1211 * cluster, we know to go and search again for the bitmaps and
1212 * start looking for space there
1213 */
1214 if (next->bitmap) {
1215 if (!found_bitmap)
1216 offset = next->offset;
1217 found_bitmap = true;
1218 last = next;
1219 continue;
1220 }
1221
1222 /*
648 * we haven't filled the empty size and the window is 1223 * we haven't filled the empty size and the window is
649 * very large. reset and try again 1224 * very large. reset and try again
650 */ 1225 */
@@ -655,19 +1230,6 @@ again:
655 window_free = entry->bytes; 1230 window_free = entry->bytes;
656 last = entry; 1231 last = entry;
657 max_extent = 0; 1232 max_extent = 0;
658 total_retries++;
659 if (total_retries % 64 == 0) {
660 if (min_bytes >= (bytes + empty_size)) {
661 ret = -ENOSPC;
662 goto out;
663 }
664 /*
665 * grow our allocation a bit, we're not having
666 * much luck
667 */
668 min_bytes *= 2;
669 goto again;
670 }
671 } else { 1233 } else {
672 last = next; 1234 last = next;
673 window_free += next->bytes; 1235 window_free += next->bytes;
@@ -685,11 +1247,19 @@ again:
685 * The cluster includes an rbtree, but only uses the offset index 1247 * The cluster includes an rbtree, but only uses the offset index
686 * of each free space cache entry. 1248 * of each free space cache entry.
687 */ 1249 */
688 while(1) { 1250 while (1) {
689 node = rb_next(&entry->offset_index); 1251 node = rb_next(&entry->offset_index);
690 unlink_free_space(block_group, entry); 1252 if (entry->bitmap && node) {
1253 entry = rb_entry(node, struct btrfs_free_space,
1254 offset_index);
1255 continue;
1256 } else if (entry->bitmap && !node) {
1257 break;
1258 }
1259
1260 rb_erase(&entry->offset_index, &block_group->free_space_offset);
691 ret = tree_insert_offset(&cluster->root, entry->offset, 1261 ret = tree_insert_offset(&cluster->root, entry->offset,
692 &entry->offset_index); 1262 &entry->offset_index, 0);
693 BUG_ON(ret); 1263 BUG_ON(ret);
694 1264
695 if (!node || entry == last) 1265 if (!node || entry == last)
@@ -697,8 +1267,10 @@ again:
697 1267
698 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1268 entry = rb_entry(node, struct btrfs_free_space, offset_index);
699 } 1269 }
700 ret = 0; 1270
701 cluster->max_size = max_extent; 1271 cluster->max_size = max_extent;
1272got_it:
1273 ret = 0;
702 atomic_inc(&block_group->count); 1274 atomic_inc(&block_group->count);
703 list_add_tail(&cluster->block_group_list, &block_group->cluster_list); 1275 list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
704 cluster->block_group = block_group; 1276 cluster->block_group = block_group;
@@ -718,6 +1290,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
718 spin_lock_init(&cluster->refill_lock); 1290 spin_lock_init(&cluster->refill_lock);
719 cluster->root.rb_node = NULL; 1291 cluster->root.rb_node = NULL;
720 cluster->max_size = 0; 1292 cluster->max_size = 0;
1293 cluster->points_to_bitmap = false;
721 INIT_LIST_HEAD(&cluster->block_group_list); 1294 INIT_LIST_HEAD(&cluster->block_group_list);
722 cluster->block_group = NULL; 1295 cluster->block_group = NULL;
723} 1296}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 266fb8764054..890a8e79011b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -19,6 +19,14 @@
19#ifndef __BTRFS_FREE_SPACE_CACHE 19#ifndef __BTRFS_FREE_SPACE_CACHE
20#define __BTRFS_FREE_SPACE_CACHE 20#define __BTRFS_FREE_SPACE_CACHE
21 21
22struct btrfs_free_space {
23 struct rb_node offset_index;
24 u64 offset;
25 u64 bytes;
26 unsigned long *bitmap;
27 struct list_head list;
28};
29
22int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 30int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
23 u64 bytenr, u64 size); 31 u64 bytenr, u64 size);
24int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 32int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,