author    Josef Bacik <josef@redhat.com>	2009-07-13 21:29:25 -0400
committer Chris Mason <chris.mason@oracle.com>	2009-07-24 09:23:39 -0400
commit    817d52f8dba26d0295c26035531c30ce5f1e3c3e
tree      5230153e86323de48e7e1440352d1b74d2d9961d /fs/btrfs/extent-tree.c
parent    963030817060e4f109be1993b9ae8f81dbf5e11a
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to allow people to allocate sooner. Instead of blocking up behind the caching mutex, we instead kick off the caching kthread and then attempt to make an allocation. If we cannot, we wait on the block group's caching waitqueue, which the caching kthread wakes up every time it finds 2 MiB worth of space, and then again when it has finished caching.

This is how I tested the speedup from this:

mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo

Without my changes this took 11 seconds on my box; with these changes it now takes 1 second.

Another change that has been put in place is that we lock the super mirrors in the pinned extent map in order to keep us from adding that space as free space when caching the block group. This doesn't really change anything else as far as the pinned extent map is concerned, since for actual pinned extents we use EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock those extents to keep from leaking memory.

I've also added a check so that when we are reading block groups from disk, if the amount of space used equals the size of the block group, we go ahead and mark the block group as cached. This drastically reduces the amount of time it takes to cache the block groups. Using the same test as above, except doing a dd to a file and then unmounting, it used to take 33 seconds to umount; now it takes 3 seconds.

This version uses the commit_root in the caching kthread, and keeps track of how many async caching threads are running at any given time, so that if one of the async threads is still running as we cross transactions we can wait until it has finished before handling the pinned extents.

Thank you,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
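[Editor's note: the following sketch is not part of the patch; it is a minimal user-space illustration of the wait/wake scheme the commit message describes. A caching thread publishes free space as it scans, waking any waiting allocators every time another 2 MiB has been found and once more when the scan completes, while allocators block until either enough space is visible or caching is done. The kernel code in the diff below uses a waitqueue (cache->caching_q) and wait_event(); this sketch substitutes a pthread condition variable, and names such as block_group, publish_free_space and wait_cache_progress are hypothetical.]

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct block_group {
	pthread_mutex_t lock;
	pthread_cond_t  caching_q;   /* stand-in for the kernel waitqueue */
	uint64_t        free_space;  /* bytes of free space found so far  */
	bool            cache_done;  /* stand-in for BTRFS_CACHE_FINISHED */
};

static void block_group_init(struct block_group *bg)
{
	pthread_mutex_init(&bg->lock, NULL);
	pthread_cond_init(&bg->caching_q, NULL);
	bg->free_space = 0;
	bg->cache_done = false;
}

/* caching side: called each time roughly 2 MiB more free space is found */
static void publish_free_space(struct block_group *bg, uint64_t found)
{
	pthread_mutex_lock(&bg->lock);
	bg->free_space += found;
	pthread_mutex_unlock(&bg->lock);
	pthread_cond_broadcast(&bg->caching_q);	/* wake waiting allocators */
}

/* caching side: called once the whole block group has been scanned */
static void finish_caching(struct block_group *bg)
{
	pthread_mutex_lock(&bg->lock);
	bg->cache_done = true;
	pthread_mutex_unlock(&bg->lock);
	pthread_cond_broadcast(&bg->caching_q);
}

/* allocator side: sleep until enough space shows up or caching completes */
static void wait_cache_progress(struct block_group *bg, uint64_t num_bytes)
{
	pthread_mutex_lock(&bg->lock);
	while (!bg->cache_done && bg->free_space < num_bytes)
		pthread_cond_wait(&bg->caching_q, &bg->lock);
	pthread_mutex_unlock(&bg->lock);
}

[In the patch itself, wait_block_group_cache_progress() plays the allocator role above, and the wake_up(&block_group->caching_q) calls in the caching kthread play the publisher role.]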
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--  fs/btrfs/extent-tree.c | 471
 1 file changed, 380 insertions(+), 91 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 98697be6bdde..9a489cc89fd3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -21,6 +21,7 @@
 #include <linux/blkdev.h>
 #include <linux/sort.h>
 #include <linux/rcupdate.h>
+#include <linux/kthread.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags, int force);
 
+static noinline int
+block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+	smp_mb();
+	return cache->cached == BTRFS_CACHE_FINISHED;
+}
+
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
 	return (cache->flags & bits) == bits;
@@ -145,21 +153,64 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 	return ret;
 }
 
+void btrfs_free_super_mirror_extents(struct btrfs_fs_info *info)
+{
+	u64 start, end, last = 0;
+	int ret;
+
+	while (1) {
+		ret = find_first_extent_bit(&info->pinned_extents, last,
+					    &start, &end, EXTENT_LOCKED);
+		if (ret)
+			break;
+
+		unlock_extent(&info->pinned_extents, start, end, GFP_NOFS);
+		last = end+1;
+	}
+}
+
+static int remove_sb_from_cache(struct btrfs_root *root,
+				struct btrfs_block_group_cache *cache)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u64 bytenr;
+	u64 *logical;
+	int stripe_len;
+	int i, nr, ret;
+
+	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+		bytenr = btrfs_sb_offset(i);
+		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
+				       cache->key.objectid, bytenr,
+				       0, &logical, &nr, &stripe_len);
+		BUG_ON(ret);
+		while (nr--) {
+			try_lock_extent(&fs_info->pinned_extents,
+					logical[nr],
+					logical[nr] + stripe_len - 1, GFP_NOFS);
+		}
+		kfree(logical);
+	}
+
+	return 0;
+}
+
 /*
  * this is only called by cache_block_group, since we could have freed extents
  * we need to check the pinned_extents for any extents that can't be used yet
  * since their free space will be released as soon as the transaction commits.
  */
-static int add_new_free_space(struct btrfs_block_group_cache *block_group,
+static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 			      struct btrfs_fs_info *info, u64 start, u64 end)
 {
-	u64 extent_start, extent_end, size;
+	u64 extent_start, extent_end, size, total_added = 0;
 	int ret;
 
 	while (start < end) {
 		ret = find_first_extent_bit(&info->pinned_extents, start,
 					    &extent_start, &extent_end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY|EXTENT_LOCKED|
+					    EXTENT_DELALLOC);
 		if (ret)
 			break;
 
@@ -167,6 +218,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 			start = extent_end + 1;
 		} else if (extent_start > start && extent_start < end) {
 			size = extent_start - start;
+			total_added += size;
 			ret = btrfs_add_free_space(block_group, start,
 						   size);
 			BUG_ON(ret);
@@ -178,84 +230,139 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 
 	if (start < end) {
 		size = end - start;
+		total_added += size;
 		ret = btrfs_add_free_space(block_group, start, size);
 		BUG_ON(ret);
 	}
 
-	return 0;
+	return total_added;
 }
 
-static int remove_sb_from_cache(struct btrfs_root *root,
-				struct btrfs_block_group_cache *cache)
+DEFINE_MUTEX(discard_mutex);
+
+/*
+ * if async kthreads are running when we cross transactions, we mark any pinned
+ * extents with EXTENT_DELALLOC and then let the caching kthreads clean up those
+ * extents when they are done. Also we run this from btrfs_finish_extent_commit
+ * in case there were some pinned extents that were missed because we had
+ * already cached that block group.
+ */
+static void btrfs_discard_pinned_extents(struct btrfs_fs_info *fs_info,
+					 struct btrfs_block_group_cache *cache)
 {
-	u64 bytenr;
-	u64 *logical;
-	int stripe_len;
-	int i, nr, ret;
+	u64 start, end, last;
+	int ret;
 
-	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
-		bytenr = btrfs_sb_offset(i);
-		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
-				       cache->key.objectid, bytenr, 0,
-				       &logical, &nr, &stripe_len);
-		BUG_ON(ret);
-		while (nr--) {
-			btrfs_remove_free_space(cache, logical[nr],
-						stripe_len);
+	if (!cache)
+		last = 0;
+	else
+		last = cache->key.objectid;
+
+	mutex_lock(&discard_mutex);
+	while (1) {
+		ret = find_first_extent_bit(&fs_info->pinned_extents, last,
+					    &start, &end, EXTENT_DELALLOC);
+		if (ret)
+			break;
+
+		if (cache && start >= cache->key.objectid + cache->key.offset)
+			break;
+
+
+		if (!cache) {
+			cache = btrfs_lookup_block_group(fs_info, start);
+			BUG_ON(!cache);
+
+			start = max(start, cache->key.objectid);
+			end = min(end, cache->key.objectid + cache->key.offset - 1);
+
+			if (block_group_cache_done(cache))
+				btrfs_add_free_space(cache, start,
+						     end - start + 1);
+			cache = NULL;
+		} else {
+			start = max(start, cache->key.objectid);
+			end = min(end, cache->key.objectid + cache->key.offset - 1);
+			btrfs_add_free_space(cache, start, end - start + 1);
+		}
+
+		clear_extent_bits(&fs_info->pinned_extents, start, end,
+				  EXTENT_DELALLOC, GFP_NOFS);
+		last = end + 1;
+
+		if (need_resched()) {
+			mutex_unlock(&discard_mutex);
+			cond_resched();
+			mutex_lock(&discard_mutex);
 		}
-		kfree(logical);
 	}
-	return 0;
+	mutex_unlock(&discard_mutex);
 }
 
-static int cache_block_group(struct btrfs_root *root,
-			     struct btrfs_block_group_cache *block_group)
+static int caching_kthread(void *data)
 {
+	struct btrfs_block_group_cache *block_group = data;
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
+	u64 last = 0;
 	struct btrfs_path *path;
 	int ret = 0;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
 	int slot;
-	u64 last;
-
-	if (!block_group)
-		return 0;
+	u64 total_found = 0;
 
-	root = root->fs_info->extent_root;
-
-	if (block_group->cached)
-		return 0;
+	BUG_ON(!fs_info);
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	path->reada = 2;
+	atomic_inc(&fs_info->async_caching_threads);
+	atomic_inc(&block_group->space_info->caching_threads);
+	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+again:
+	/* need to make sure the commit_root doesn't disappear */
+	down_read(&fs_info->extent_root->commit_root_sem);
+
 	/*
-	 * we get into deadlocks with paths held by callers of this function.
-	 * since the alloc_mutex is protecting things right now, just
-	 * skip the locking here
+	 * We don't want to deadlock with somebody trying to allocate a new
+	 * extent for the extent root while also trying to search the extent
+	 * root to add free space. So we skip locking and search the commit
+	 * root, since its read-only
 	 */
 	path->skip_locking = 1;
-	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+	path->search_commit_root = 1;
+	path->reada = 2;
+
 	key.objectid = last;
 	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
 
 	while (1) {
+		smp_mb();
+		if (block_group->fs_info->closing)
+			break;
+
 		leaf = path->nodes[0];
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(leaf)) {
-			ret = btrfs_next_leaf(root, path);
+			ret = btrfs_next_leaf(fs_info->extent_root, path);
 			if (ret < 0)
 				goto err;
-			if (ret == 0)
-				continue;
-			else
+			else if (ret)
 				break;
+
+			if (need_resched()) {
+				btrfs_release_path(fs_info->extent_root, path);
+				up_read(&fs_info->extent_root->commit_root_sem);
+				cond_resched();
+				goto again;
+			}
+
+			continue;
 		}
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (key.objectid < block_group->key.objectid)
@@ -266,24 +373,63 @@ static int cache_block_group(struct btrfs_root *root,
 			break;
 
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
-			add_new_free_space(block_group, root->fs_info, last,
-					   key.objectid);
-
+			total_found += add_new_free_space(block_group,
+							  fs_info, last,
+							  key.objectid);
 			last = key.objectid + key.offset;
 		}
+
+		if (total_found > (1024 * 1024 * 2)) {
+			total_found = 0;
+			wake_up(&block_group->caching_q);
+		}
 next:
 		path->slots[0]++;
 	}
+	ret = 0;
 
-	add_new_free_space(block_group, root->fs_info, last,
-			   block_group->key.objectid +
-			   block_group->key.offset);
+	total_found += add_new_free_space(block_group, fs_info, last,
+					  block_group->key.objectid +
+					  block_group->key.offset);
+
+	spin_lock(&block_group->lock);
+	block_group->cached = BTRFS_CACHE_FINISHED;
+	spin_unlock(&block_group->lock);
 
-	block_group->cached = 1;
-	remove_sb_from_cache(root, block_group);
-	ret = 0;
 err:
 	btrfs_free_path(path);
+	up_read(&fs_info->extent_root->commit_root_sem);
+	atomic_dec(&fs_info->async_caching_threads);
+	atomic_dec(&block_group->space_info->caching_threads);
+	wake_up(&block_group->caching_q);
+
+	if (!ret)
+		btrfs_discard_pinned_extents(fs_info, block_group);
+
+	return 0;
+}
+
+static int cache_block_group(struct btrfs_block_group_cache *cache)
+{
+	struct task_struct *tsk;
+	int ret = 0;
+
+	spin_lock(&cache->lock);
+	if (cache->cached != BTRFS_CACHE_NO) {
+		spin_unlock(&cache->lock);
+		return ret;
+	}
+	cache->cached = BTRFS_CACHE_STARTED;
+	spin_unlock(&cache->lock);
+
+	tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
+			  cache->key.objectid);
+	if (IS_ERR(tsk)) {
+		ret = PTR_ERR(tsk);
+		printk(KERN_ERR "error running thread %d\n", ret);
+		BUG();
+	}
+
 	return ret;
 }
 
@@ -1721,7 +1867,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 		}
 		btrfs_update_pinned_extents(root, node->bytenr,
-					    node->num_bytes, 1);
+					    node->num_bytes, 1, 0);
 		update_reserved_extents(root, node->bytenr,
 					node->num_bytes, 0);
 	}
@@ -2496,6 +2642,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->force_alloc = 0;
 	*space_info = found;
 	list_add_rcu(&found->list, &info->space_info);
+	atomic_set(&found->caching_threads, 0);
 	return 0;
 }
 
@@ -2953,7 +3100,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 }
 
 int btrfs_update_pinned_extents(struct btrfs_root *root,
-				u64 bytenr, u64 num, int pin)
+				u64 bytenr, u64 num, int pin, int mark_free)
 {
 	u64 len;
 	struct btrfs_block_group_cache *cache;
@@ -2988,7 +3135,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
 			fs_info->total_pinned -= len;
-			if (cache->cached)
+			if (block_group_cache_done(cache) && mark_free)
 				btrfs_add_free_space(cache, bytenr, len);
 		}
 		btrfs_put_block_group(cache);
@@ -3034,14 +3181,27 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 	u64 last = 0;
 	u64 start;
 	u64 end;
+	bool caching_kthreads = false;
 	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
 	int ret;
 
+	if (atomic_read(&root->fs_info->async_caching_threads))
+		caching_kthreads = true;
+
 	while (1) {
 		ret = find_first_extent_bit(pinned_extents, last,
 					    &start, &end, EXTENT_DIRTY);
 		if (ret)
 			break;
+
+		/*
+		 * we need to make sure that the pinned extents don't go away
+		 * while we are caching block groups
+		 */
+		if (unlikely(caching_kthreads))
+			set_extent_delalloc(pinned_extents, start, end,
+					    GFP_NOFS);
+
 		set_extent_dirty(copy, start, end, GFP_NOFS);
 		last = end + 1;
 	}
@@ -3055,6 +3215,12 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	u64 start;
 	u64 end;
 	int ret;
+	int mark_free = 1;
+
+	ret = find_first_extent_bit(&root->fs_info->pinned_extents, 0,
+				    &start, &end, EXTENT_DELALLOC);
+	if (!ret)
+		mark_free = 0;
 
 	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
@@ -3065,11 +3231,16 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		ret = btrfs_discard_extent(root, start, end + 1 - start);
 
 		/* unlocks the pinned mutex */
-		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
+		btrfs_update_pinned_extents(root, start, end + 1 - start, 0,
+					    mark_free);
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
 
 		cond_resched();
 	}
+
+	if (unlikely(!mark_free))
+		btrfs_discard_pinned_extents(root->fs_info, NULL);
+
 	return ret;
 }
 
@@ -3110,7 +3281,7 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 pinit:
 	btrfs_set_path_blocking(path);
 	/* unlocks the pinned mutex */
-	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1, 0);
 
 	BUG_ON(err < 0);
 	return 0;
@@ -3421,7 +3592,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
 		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
 		/* unlocks the pinned mutex */
-		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1, 0);
 		update_reserved_extents(root, bytenr, num_bytes, 0);
 		ret = 0;
 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
@@ -3448,6 +3619,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
 }
 
 /*
+ * when we wait for progress in the block group caching, its because
+ * our allocation attempt failed at least once. So, we must sleep
+ * and let some progress happen before we try again.
+ *
+ * This function will sleep at least once waiting for new free space to
+ * show up, and then it will check the block group free space numbers
+ * for our min num_bytes. Another option is to have it go ahead
+ * and look in the rbtree for a free extent of a given size, but this
+ * is a good start.
+ */
+static noinline int
+wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+				u64 num_bytes)
+{
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
+
+	if (block_group_cache_done(cache)) {
+		finish_wait(&cache->caching_q, &wait);
+		return 0;
+	}
+	schedule();
+	finish_wait(&cache->caching_q, &wait);
+
+	wait_event(cache->caching_q, block_group_cache_done(cache) ||
+		   (cache->free_space >= num_bytes));
+	return 0;
+}
+
+enum btrfs_loop_type {
+	LOOP_CACHED_ONLY = 0,
+	LOOP_CACHING_NOWAIT = 1,
+	LOOP_CACHING_WAIT = 2,
+	LOOP_ALLOC_CHUNK = 3,
+	LOOP_NO_EMPTY_SIZE = 4,
+};
+
+/*
  * walks the btree of allocated extents and find a hole of a given size.
  * The key ins is changed to record the hole:
  * ins->objectid == block start
@@ -3472,6 +3682,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_space_info *space_info;
 	int last_ptr_loop = 0;
 	int loop = 0;
+	bool found_uncached_bg = false;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3503,15 +3714,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	search_start = max(search_start, first_logical_byte(root, 0));
 	search_start = max(search_start, hint_byte);
 
-	if (!last_ptr) {
+	if (!last_ptr)
 		empty_cluster = 0;
-		loop = 1;
-	}
 
 	if (search_start == hint_byte) {
 		block_group = btrfs_lookup_block_group(root->fs_info,
 						       search_start);
-		if (block_group && block_group_bits(block_group, data)) {
+		/*
+		 * we don't want to use the block group if it doesn't match our
+		 * allocation bits, or if its not cached.
+		 */
+		if (block_group && block_group_bits(block_group, data) &&
+		    block_group_cache_done(block_group)) {
 			down_read(&space_info->groups_sem);
 			if (list_empty(&block_group->list) ||
 			    block_group->ro) {
@@ -3534,21 +3748,35 @@ search:
 	down_read(&space_info->groups_sem);
 	list_for_each_entry(block_group, &space_info->block_groups, list) {
 		u64 offset;
+		int cached;
 
 		atomic_inc(&block_group->count);
 		search_start = block_group->key.objectid;
 
 have_block_group:
-		if (unlikely(!block_group->cached)) {
-			mutex_lock(&block_group->cache_mutex);
-			ret = cache_block_group(root, block_group);
-			mutex_unlock(&block_group->cache_mutex);
-			if (ret) {
-				btrfs_put_block_group(block_group);
-				break;
+		if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+			/*
+			 * we want to start caching kthreads, but not too many
+			 * right off the bat so we don't overwhelm the system,
+			 * so only start them if there are less than 2 and we're
+			 * in the initial allocation phase.
+			 */
+			if (loop > LOOP_CACHING_NOWAIT ||
+			    atomic_read(&space_info->caching_threads) < 2) {
+				ret = cache_block_group(block_group);
+				BUG_ON(ret);
 			}
 		}
 
+		cached = block_group_cache_done(block_group);
+		if (unlikely(!cached)) {
+			found_uncached_bg = true;
+
+			/* if we only want cached bgs, loop */
+			if (loop == LOOP_CACHED_ONLY)
+				goto loop;
+		}
+
 		if (unlikely(block_group->ro))
 			goto loop;
 
@@ -3627,14 +3855,21 @@ refill_cluster:
 				spin_unlock(&last_ptr->refill_lock);
 				goto checks;
 			}
+		} else if (!cached && loop > LOOP_CACHING_NOWAIT) {
+			spin_unlock(&last_ptr->refill_lock);
+
+			wait_block_group_cache_progress(block_group,
+			       num_bytes + empty_cluster + empty_size);
+			goto have_block_group;
 		}
+
 		/*
 		 * at this point we either didn't find a cluster
 		 * or we weren't able to allocate a block from our
 		 * cluster. Free the cluster we've been trying
 		 * to use, and go to the next block group
 		 */
-		if (loop < 2) {
+		if (loop < LOOP_NO_EMPTY_SIZE) {
 			btrfs_return_cluster_to_free_space(NULL,
 					       last_ptr);
 			spin_unlock(&last_ptr->refill_lock);
@@ -3645,8 +3880,15 @@ refill_cluster:
 
 		offset = btrfs_find_space_for_alloc(block_group, search_start,
 						    num_bytes, empty_size);
-		if (!offset)
+		if (!offset && (cached || (!cached &&
+		    loop == LOOP_CACHING_NOWAIT))) {
 			goto loop;
+		} else if (!offset && (!cached &&
+			   loop > LOOP_CACHING_NOWAIT)) {
+			wait_block_group_cache_progress(block_group,
+			       num_bytes + empty_size);
+			goto have_block_group;
+		}
 checks:
 		search_start = stripe_align(root, offset);
 		/* move on to the next group */
@@ -3694,13 +3936,26 @@ loop:
 	}
 	up_read(&space_info->groups_sem);
 
-	/* loop == 0, try to find a clustered alloc in every block group
-	 * loop == 1, try again after forcing a chunk allocation
-	 * loop == 2, set empty_size and empty_cluster to 0 and try again
+	/* LOOP_CACHED_ONLY, only search fully cached block groups
+	 * LOOP_CACHING_NOWAIT, search partially cached block groups, but
+	 * dont wait foR them to finish caching
+	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+	 * again
 	 */
-	if (!ins->objectid && loop < 3 &&
-	    (empty_size || empty_cluster || allowed_chunk_alloc)) {
-		if (loop >= 2) {
+	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
+	    (found_uncached_bg || empty_size || empty_cluster ||
+	     allowed_chunk_alloc)) {
+		if (found_uncached_bg) {
+			found_uncached_bg = false;
+			if (loop < LOOP_CACHING_WAIT) {
+				loop++;
+				goto search;
+			}
+		}
+
+		if (loop == LOOP_ALLOC_CHUNK) {
 			empty_size = 0;
 			empty_cluster = 0;
 		}
@@ -3713,7 +3968,7 @@ loop:
 		space_info->force_alloc = 1;
 	}
 
-	if (loop < 3) {
+	if (loop < LOOP_NO_EMPTY_SIZE) {
 		loop++;
 		goto search;
 	}
@@ -3809,7 +4064,7 @@ again:
 					       num_bytes, data, 1);
 		goto again;
 	}
-	if (ret) {
+	if (ret == -ENOSPC) {
 		struct btrfs_space_info *sinfo;
 
 		sinfo = __find_space_info(root->fs_info, data);
@@ -3817,7 +4072,6 @@ again:
 		       "wanted %llu\n", (unsigned long long)data,
 		       (unsigned long long)num_bytes);
 		dump_space_info(sinfo, num_bytes);
-		BUG();
 	}
 
 	return ret;
@@ -3855,7 +4109,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
 				     empty_size, hint_byte, search_end, ins,
 				     data);
-	update_reserved_extents(root, ins->objectid, ins->offset, 1);
+	if (!ret)
+		update_reserved_extents(root, ins->objectid, ins->offset, 1);
+
 	return ret;
 }
 
@@ -4017,9 +4273,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *block_group;
 
 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-	mutex_lock(&block_group->cache_mutex);
-	cache_block_group(root, block_group);
-	mutex_unlock(&block_group->cache_mutex);
+	cache_block_group(block_group);
+	wait_event(block_group->caching_q,
+		   block_group_cache_done(block_group));
 
 	ret = btrfs_remove_free_space(block_group, ins->objectid,
 				      ins->offset);
@@ -4050,7 +4306,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
 	ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
 				     empty_size, hint_byte, search_end,
 				     ins, 0);
-	BUG_ON(ret);
+	if (ret)
+		return ret;
 
 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
 		if (parent == 0)
@@ -6966,11 +7223,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 			       &info->block_group_cache_tree);
 		spin_unlock(&info->block_group_cache_lock);
 
-		btrfs_remove_free_space_cache(block_group);
 		down_write(&block_group->space_info->groups_sem);
 		list_del(&block_group->list);
 		up_write(&block_group->space_info->groups_sem);
 
+		if (block_group->cached == BTRFS_CACHE_STARTED)
+			wait_event(block_group->caching_q,
+				   block_group_cache_done(block_group));
+
+		btrfs_remove_free_space_cache(block_group);
+
 		WARN_ON(atomic_read(&block_group->count) != 1);
 		kfree(block_group);
 
@@ -7036,10 +7298,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		atomic_set(&cache->count, 1);
 		spin_lock_init(&cache->lock);
 		spin_lock_init(&cache->tree_lock);
-		mutex_init(&cache->cache_mutex);
+		cache->fs_info = info;
+		init_waitqueue_head(&cache->caching_q);
 		INIT_LIST_HEAD(&cache->list);
 		INIT_LIST_HEAD(&cache->cluster_list);
-		cache->sectorsize = root->sectorsize;
 
 		/*
 		 * we only want to have 32k of ram per block group for keeping
@@ -7057,6 +7319,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
 		cache->flags = btrfs_block_group_flags(&cache->item);
+		cache->sectorsize = root->sectorsize;
+
+		remove_sb_from_cache(root, cache);
+
+		/*
+		 * check for two cases, either we are full, and therefore
+		 * don't need to bother with the caching work since we won't
+		 * find any space, or we are empty, and we can just add all
+		 * the space in and be done with it. This saves us _alot_ of
+		 * time, particularly in the full case.
+		 */
+		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
+			cache->cached = BTRFS_CACHE_FINISHED;
+		} else if (btrfs_block_group_used(&cache->item) == 0) {
+			cache->cached = BTRFS_CACHE_FINISHED;
+			add_new_free_space(cache, root->fs_info,
+					   found_key.objectid,
+					   found_key.objectid +
+					   found_key.offset);
+		}
 
 		ret = update_space_info(info, cache->flags, found_key.offset,
 					btrfs_block_group_used(&cache->item),
@@ -7112,7 +7394,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
 	spin_lock_init(&cache->tree_lock);
-	mutex_init(&cache->cache_mutex);
+	init_waitqueue_head(&cache->caching_q);
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
 
@@ -7121,11 +7403,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->flags = type;
 	btrfs_set_block_group_flags(&cache->item, type);
 
-	cache->cached = 1;
-	ret = btrfs_add_free_space(cache, chunk_offset, size);
-	BUG_ON(ret);
+	cache->cached = BTRFS_CACHE_FINISHED;
 	remove_sb_from_cache(root, cache);
 
+	add_new_free_space(cache, root->fs_info, chunk_offset,
+			   chunk_offset + size);
+
 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
 				&cache->space_info);
 	BUG_ON(ret);
@@ -7184,7 +7467,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	rb_erase(&block_group->cache_node,
 		 &root->fs_info->block_group_cache_tree);
 	spin_unlock(&root->fs_info->block_group_cache_lock);
-	btrfs_remove_free_space_cache(block_group);
+
 	down_write(&block_group->space_info->groups_sem);
 	/*
 	 * we must use list_del_init so people can check to see if they
@@ -7193,6 +7476,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	list_del_init(&block_group->list);
 	up_write(&block_group->space_info->groups_sem);
 
+	if (block_group->cached == BTRFS_CACHE_STARTED)
+		wait_event(block_group->caching_q,
+			   block_group_cache_done(block_group));
+
+	btrfs_remove_free_space_cache(block_group);
+
 	spin_lock(&block_group->space_info->lock);
 	block_group->space_info->total_bytes -= block_group->key.offset;
 	block_group->space_info->bytes_readonly -= block_group->key.offset;