aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/free-space-cache.c
diff options
context:
space:
mode:
authorJosef Bacik <jbacik@redhat.com>2008-09-23 13:14:11 -0400
committerChris Mason <chris.mason@oracle.com>2008-09-25 11:04:07 -0400
commit0f9dd46cda36b8de3b9f48bc42bd09d20b9c3b52 (patch)
tree2dcba11fd2fb5a4227fd8f8d2d076641f115a7b4 /fs/btrfs/free-space-cache.c
parentef8bbdfe7e12dc9b4e80756f6d606c4639c65851 (diff)
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two rb-trees per block group to track free space areas via offset and size. The reason to do this is because most allocations come with a hint byte where to start, so we can usually find a chunk of free space at that hint byte to satisfy the allocation and get good space packing. If we cannot find free space at or after the given offset we fall back on looking for a chunk of the given size as close to that given offset as possible. When we fall back on the size search we also try to find a slot as close to the size we want as possible, to avoid breaking small chunks off of huge areas if possible. 2) remove the extent_io_tree that tracked the block group cache from fs_info and replaced it with an rb-tree thats tracks block group cache via offset. also added a per space_info list that tracks the block group cache for the particular space so we can lookup related block groups easily. 3) cleaned up the allocation code to make it a little easier to read and a little less complicated. Basically there are 3 steps, first look from our provided hint. If we couldn't find from that given hint, start back at our original search start and look for space from there. If that fails try to allocate space if we can and start looking again. If not we're screwed and need to start over again. 4) small fixes. there were some issues in volumes.c where we wouldn't allocate the rest of the disk. fixed cow_file_range to actually pass the alloc_hint, which has helped a good bit in making the fs_mark test I run have semi-normal results as we run out of space. Generally with data allocations we don't track where we last allocated from, so everytime we did a data allocation we'd search through every block group that we have looking for free space. Now searching a block group with no free space isn't terribly time consuming, it was causing a slight degradation as we got more data block groups. The alloc_hint has fixed this slight degredation and made things semi-normal. There is still one nagging problem I'm working on where we will get ENOSPC when there is definitely plenty of space. This only happens with metadata allocations, and only when we are almost full. So you generally hit the 85% mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm still tracking it down, but until then this seems to be pretty stable and make a significant performance gain. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/free-space-cache.c')
-rw-r--r--fs/btrfs/free-space-cache.c415
1 files changed, 415 insertions, 0 deletions
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 000000000000..01c26e8ae555
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,415 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21
22static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node)
24{
25 struct rb_node **p = &root->rb_node;
26 struct rb_node *parent = NULL;
27 struct btrfs_free_space *info;
28
29 while (*p) {
30 parent = *p;
31 info = rb_entry(parent, struct btrfs_free_space, offset_index);
32
33 if (offset < info->offset)
34 p = &(*p)->rb_left;
35 else if (offset > info->offset)
36 p = &(*p)->rb_right;
37 else
38 return -EEXIST;
39 }
40
41 rb_link_node(node, parent, p);
42 rb_insert_color(node, root);
43
44 return 0;
45}
46
47static int tree_insert_bytes(struct rb_root *root, u64 bytes,
48 struct rb_node *node)
49{
50 struct rb_node **p = &root->rb_node;
51 struct rb_node *parent = NULL;
52 struct btrfs_free_space *info;
53
54 while (*p) {
55 parent = *p;
56 info = rb_entry(parent, struct btrfs_free_space, bytes_index);
57
58 if (bytes < info->bytes)
59 p = &(*p)->rb_left;
60 else
61 p = &(*p)->rb_right;
62 }
63
64 rb_link_node(node, parent, p);
65 rb_insert_color(node, root);
66
67 return 0;
68}
69
70/*
71 * searches the tree for the given offset. If contains is set we will return
72 * the free space that contains the given offset. If contains is not set we
73 * will return the free space that starts at or after the given offset and is
74 * at least bytes long.
75 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes,
78 int contains)
79{
80 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL;
82
83 while (n) {
84 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85
86 if (offset < entry->offset) {
87 if (!contains &&
88 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes))
90 ret = entry;
91 n = n->rb_left;
92 } else if (offset > entry->offset) {
93 if (contains &&
94 (entry->offset + entry->bytes - 1) >= offset) {
95 ret = entry;
96 break;
97 }
98 n = n->rb_right;
99 } else {
100 if (bytes > entry->bytes) {
101 n = n->rb_right;
102 continue;
103 }
104 ret = entry;
105 break;
106 }
107 }
108
109 return ret;
110}
111
112/*
113 * return a chunk at least bytes size, as close to offset that we can get.
114 */
115static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
116 u64 offset, u64 bytes)
117{
118 struct rb_node *n = root->rb_node;
119 struct btrfs_free_space *entry, *ret = NULL;
120
121 while (n) {
122 entry = rb_entry(n, struct btrfs_free_space, bytes_index);
123
124 if (bytes < entry->bytes) {
125 /*
126 * We prefer to get a hole size as close to the size we
127 * are asking for so we don't take small slivers out of
128 * huge holes, but we also want to get as close to the
129 * offset as possible so we don't have a whole lot of
130 * fragmentation.
131 */
132 if (offset <= entry->offset) {
133 if (!ret)
134 ret = entry;
135 else if (entry->bytes < ret->bytes)
136 ret = entry;
137 else if (entry->offset < ret->offset)
138 ret = entry;
139 }
140 n = n->rb_left;
141 } else if (bytes > entry->bytes) {
142 n = n->rb_right;
143 } else {
144 /*
145 * Ok we may have multiple chunks of the wanted size,
146 * so we don't want to take the first one we find, we
147 * want to take the one closest to our given offset, so
148 * keep searching just in case theres a better match.
149 */
150 n = n->rb_right;
151 if (offset > entry->offset)
152 continue;
153 else if (!ret || entry->offset < ret->offset)
154 ret = entry;
155 }
156 }
157
158 return ret;
159}
160
161static void unlink_free_space(struct btrfs_block_group_cache *block_group,
162 struct btrfs_free_space *info)
163{
164 rb_erase(&info->offset_index, &block_group->free_space_offset);
165 rb_erase(&info->bytes_index, &block_group->free_space_bytes);
166}
167
168static int link_free_space(struct btrfs_block_group_cache *block_group,
169 struct btrfs_free_space *info)
170{
171 int ret = 0;
172
173
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index);
176 if (ret)
177 return ret;
178
179 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
180 &info->bytes_index);
181 if (ret)
182 return ret;
183
184 return ret;
185}
186
187int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes)
189{
190 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0;
195
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info)
198 return -ENOMEM;
199
200 /*
201 * first we want to see if there is free space adjacent to the range we
202 * are adding, if there is remove that struct and add a new one to
203 * cover the entire range
204 */
205 spin_lock(&block_group->lock);
206
207 right_info = tree_search_offset(&block_group->free_space_offset,
208 offset+bytes, 0, 1);
209 left_info = tree_search_offset(&block_group->free_space_offset,
210 offset-1, 0, 1);
211
212 if (right_info && right_info->offset == offset+bytes) {
213 unlink_free_space(block_group, right_info);
214 info = right_info;
215 info->offset = offset;
216 info->bytes += bytes;
217 } else if (right_info && right_info->offset != offset+bytes) {
218 printk(KERN_ERR "adding space in the middle of an existing "
219 "free space area. existing: offset=%Lu, bytes=%Lu. "
220 "new: offset=%Lu, bytes=%Lu\n", right_info->offset,
221 right_info->bytes, offset, bytes);
222 BUG();
223 }
224
225 if (left_info) {
226 unlink_free_space(block_group, left_info);
227
228 if (unlikely((left_info->offset + left_info->bytes) !=
229 offset)) {
230 printk(KERN_ERR "free space to the left of new free "
231 "space isn't quite right. existing: offset=%Lu,"
232 " bytes=%Lu. new: offset=%Lu, bytes=%Lu\n",
233 left_info->offset, left_info->bytes, offset,
234 bytes);
235 BUG();
236 }
237
238 if (info) {
239 info->offset = left_info->offset;
240 info->bytes += left_info->bytes;
241 kfree(left_info);
242 } else {
243 info = left_info;
244 info->bytes += bytes;
245 }
246 }
247
248 if (info) {
249 ret = link_free_space(block_group, info);
250 if (!ret)
251 info = NULL;
252 goto out;
253 }
254
255 info = alloc_info;
256 alloc_info = NULL;
257 info->offset = offset;
258 info->bytes = bytes;
259
260 ret = link_free_space(block_group, info);
261 if (ret)
262 kfree(info);
263out:
264 spin_unlock(&block_group->lock);
265 if (ret) {
266 printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
267 if (ret == -EEXIST)
268 BUG();
269 }
270
271 if (alloc_info)
272 kfree(alloc_info);
273
274 return ret;
275}
276
277int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
278 u64 offset, u64 bytes)
279{
280 struct btrfs_free_space *info;
281 int ret = 0;
282
283 spin_lock(&block_group->lock);
284 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
285 1);
286
287 if (info && info->offset == offset) {
288 if (info->bytes < bytes) {
289 printk(KERN_ERR "Found free space at %Lu, size %Lu,"
290 "trying to use %Lu\n",
291 info->offset, info->bytes, bytes);
292 WARN_ON(1);
293 ret = -EINVAL;
294 goto out;
295 }
296
297 unlink_free_space(block_group, info);
298
299 if (info->bytes == bytes) {
300 kfree(info);
301 goto out;
302 }
303
304 info->offset += bytes;
305 info->bytes -= bytes;
306
307 ret = link_free_space(block_group, info);
308 BUG_ON(ret);
309 } else {
310 WARN_ON(1);
311 }
312out:
313 spin_unlock(&block_group->lock);
314 return ret;
315}
316
317void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
318 u64 bytes)
319{
320 struct btrfs_free_space *info;
321 struct rb_node *n;
322 int count = 0;
323
324 for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
325 info = rb_entry(n, struct btrfs_free_space, offset_index);
326 if (info->bytes >= bytes)
327 count++;
328 //printk(KERN_INFO "offset=%Lu, bytes=%Lu\n", info->offset,
329 // info->bytes);
330 }
331 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
332 "\n", count);
333}
334
335u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
336{
337 struct btrfs_free_space *info;
338 struct rb_node *n;
339 u64 ret = 0;
340
341 for (n = rb_first(&block_group->free_space_offset); n;
342 n = rb_next(n)) {
343 info = rb_entry(n, struct btrfs_free_space, offset_index);
344 ret += info->bytes;
345 }
346
347 return ret;
348}
349
350void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
351{
352 struct btrfs_free_space *info;
353 struct rb_node *node;
354
355 spin_lock(&block_group->lock);
356 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
357 info = rb_entry(node, struct btrfs_free_space, bytes_index);
358 unlink_free_space(block_group, info);
359 kfree(info);
360 if (need_resched()) {
361 spin_unlock(&block_group->lock);
362 cond_resched();
363 spin_lock(&block_group->lock);
364 }
365 }
366 spin_unlock(&block_group->lock);
367}
368
369struct btrfs_free_space *btrfs_find_free_space_offset(struct
370 btrfs_block_group_cache
371 *block_group, u64 offset,
372 u64 bytes)
373{
374 struct btrfs_free_space *ret;
375
376 spin_lock(&block_group->lock);
377 ret = tree_search_offset(&block_group->free_space_offset, offset,
378 bytes, 0);
379 spin_unlock(&block_group->lock);
380
381 return ret;
382}
383
384struct btrfs_free_space *btrfs_find_free_space_bytes(struct
385 btrfs_block_group_cache
386 *block_group, u64 offset,
387 u64 bytes)
388{
389 struct btrfs_free_space *ret;
390
391 spin_lock(&block_group->lock);
392
393 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
394 spin_unlock(&block_group->lock);
395
396 return ret;
397}
398
399struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
400 *block_group, u64 offset,
401 u64 bytes)
402{
403 struct btrfs_free_space *ret;
404
405 spin_lock(&block_group->lock);
406 ret = tree_search_offset(&block_group->free_space_offset, offset,
407 bytes, 0);
408 if (!ret)
409 ret = tree_search_bytes(&block_group->free_space_bytes,
410 offset, bytes);
411
412 spin_unlock(&block_group->lock);
413
414 return ret;
415}