author		Josef Bacik <jbacik@redhat.com>	2008-10-29 14:49:05 -0400
committer	Chris Mason <chris.mason@oracle.com>	2008-10-29 14:49:05 -0400
commit		80eb234af09dbe6c97b2e3d60a13ec391e98fbba
tree		b7641c2ae006866f4b9bb6ca9b9f02ae69e00426
parent		f82d02d9d8222183b7945e893111a6d1bf67ae4a
Btrfs: fix enospc when there is plenty of space
So there is an odd case where we can possibly return -ENOSPC when there is in
fact space to be had.  It only happens with metadata writes, and happens _very_
infrequently.  What has to happen is we have to have allocated out of the first
logical byte on the disk, which would set last_alloc to
first_logical_byte(root, 0), so search_start == orig_search_start.  We then
need to allocate for normal metadata, so BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DUP.  We will do a block lookup for the given search_start,
block_group_bits() won't match and we'll go to choose another block group.
However, because search_start matches orig_search_start we go to see if we can
allocate a chunk.  If we are in the situation that we cannot allocate a chunk,
we fail and return -ENOSPC.  This is kind of a big flaw in the way
find_free_extent works, as it, along with find_free_space, loops through _all_
of the block groups, not just the ones that we want to allocate out of.

This patch completely kills find_free_space and rolls it into
find_free_extent.  I've introduced a sort of state machine into this, which
will make it easier to get cache miss information out of the allocator, and
will work well with my locking changes.

The basic flow is this: we have the variable loop, which is 0, meaning we are
in the hint phase.  We look up the block group for the hint, and look up the
space_info for what we want to allocate out of.  If the block group we were
pointed at by the hint either isn't of the correct type, or just doesn't have
the space we need, we set head to space_info->block_groups, so we start at the
beginning of the block groups for this particular space info, and loop
through.  This is also where we add the empty_cluster to total_needed.  At
this point loop is set to 1 and we just loop through all of the block groups
for this particular space_info looking for the space we need, just as
find_free_space would have done, except we only hit the block groups we want
and not _all_ of the block groups.  If we come full circle we see if we can
allocate a chunk.  If we cannot, we exit with -ENOSPC and we are done.  If we
can, we start over at space_info->block_groups and loop through again, with
loop == 2.  If we come full circle again and haven't found what we need then
we exit with -ENOSPC.

I've been running this for a couple of days now and it seems stable, and I
haven't yet hit a -ENOSPC when there was plenty of space left.

Also I've added a groups_sem to protect the group list for the space_info.
This is part of my locking changes, but is relatively safe and seems better
than holding the space_info spinlock over that entire search time.  Thanks,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
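[Editor's illustration] To make the loop == 0/1/2 flow above concrete, here is a
minimal standalone C sketch of the same state machine.  It is not the kernel
code: struct group, its right_type/has_space fields and grow_space() are
hypothetical stand-ins for btrfs block groups, block_group_bits() /
btrfs_find_free_space() and do_chunk_alloc(), and the circular walk over a
small array stands in for the list walk over space_info->block_groups.

/* toy model of the find_free_extent() loop-phase state machine */
#include <stdio.h>
#include <stdbool.h>

#define NGROUPS 4

struct group {
	bool right_type;	/* stands in for block_group_bits() matching */
	bool has_space;		/* stands in for btrfs_find_free_space() succeeding */
};

static struct group groups[NGROUPS];	/* stands in for space_info->block_groups */

/* stands in for do_chunk_alloc(): conjure up a fresh group with free space */
static bool grow_space(void)
{
	groups[NGROUPS - 1].right_type = true;
	groups[NGROUPS - 1].has_space = true;
	return true;
}

static int find_extent(int hint)
{
	int loop = 0;		/* 0 = hint phase, 1 = full scan, 2 = scan after chunk alloc */
	int cur = hint;
	bool chunk_alloc_done = false;

	while (1) {
		struct group *g = &groups[cur];

		if (g->right_type && g->has_space)
			return cur;	/* found space; like setting ins->objectid */

		/* "new_group": advance the state machine */
		if (loop == 0) {
			cur = 0;	/* restart at the head of the group list */
			loop = 1;
			continue;
		}
		cur = (cur + 1) % NGROUPS;
		if (cur != 0)
			continue;	/* haven't come full circle yet */

		if (loop == 1 && !chunk_alloc_done && grow_space()) {
			chunk_alloc_done = true;
			loop = 2;	/* rescan everything exactly once more */
			continue;
		}
		return -1;		/* full circle on loop 2: -ENOSPC */
	}
}

int main(void)
{
	groups[1].right_type = true;	/* a matching group with no room */
	printf("allocated from group %d\n", find_extent(2));
	return 0;
}

The property the patch cares about holds in the sketch too: each phase rescans
only the groups belonging to the one space_info, and chunk allocation is
attempted at most once before the final -ENOSPC.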
Diffstat (limited to 'fs')
-rw-r--r--	fs/btrfs/ctree.h	1
-rw-r--r--	fs/btrfs/extent-tree.c	322
2 files changed, 132 insertions(+), 191 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 117090995e7c..caa860a1c3e5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -544,6 +544,7 @@ struct btrfs_space_info {
 	/* for block groups in our same type */
 	struct list_head block_groups;
 	spinlock_t lock;
+	struct rw_semaphore groups_sem;
 };
 
 struct btrfs_free_space {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 56e41369d713..e3b3e13a4817 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -317,59 +317,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 	return cache;
 }
 
-static int noinline find_free_space(struct btrfs_root *root,
-				    struct btrfs_block_group_cache **cache_ret,
-				    u64 *start_ret, u64 num, int data)
-{
-	int ret;
-	struct btrfs_block_group_cache *cache = *cache_ret;
-	struct btrfs_free_space *info = NULL;
-	u64 last;
-	u64 search_start = *start_ret;
-
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
-	if (!cache)
-		goto out;
-
-	last = max(search_start, cache->key.objectid);
-
-again:
-	ret = cache_block_group(root, cache);
-	if (ret)
-		goto out;
-
-	if (cache->ro || !block_group_bits(cache, data))
-		goto new_group;
-
-	info = btrfs_find_free_space(cache, last, num);
-	if (info) {
-		*start_ret = info->offset;
-		return 0;
-	}
-
-new_group:
-	last = cache->key.objectid + cache->key.offset;
-
-	cache = btrfs_lookup_first_block_group(root->fs_info, last);
-	if (!cache)
-		goto out;
-
-	*cache_ret = cache;
-	goto again;
-
-out:
-	return -ENOSPC;
-}
-
-static u64 div_factor(u64 num, int factor)
-{
-	if (factor == 10)
-		return num;
-	num *= factor;
-	do_div(num, 10);
-	return num;
-}
-
 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 						  u64 flags)
 {
@@ -384,6 +331,15 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 	return NULL;
 }
 
+static u64 div_factor(u64 num, int factor)
+{
+	if (factor == 10)
+		return num;
+	num *= factor;
+	do_div(num, 10);
+	return num;
+}
+
 static struct btrfs_block_group_cache *
 __btrfs_find_block_group(struct btrfs_root *root,
 			 struct btrfs_block_group_cache *hint,
@@ -1446,6 +1402,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 
 	list_add(&found->list, &info->space_info);
 	INIT_LIST_HEAD(&found->block_groups);
+	init_rwsem(&found->groups_sem);
 	spin_lock_init(&found->lock);
 	found->flags = flags;
 	found->total_bytes = total_bytes;
@@ -2208,19 +2165,22 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 				  u64 exclude_start, u64 exclude_nr,
 				  int data)
 {
-	int ret;
-	u64 orig_search_start;
+	int ret = 0;
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
-	struct btrfs_fs_info *info = root->fs_info;
 	u64 total_needed = num_bytes;
 	u64 *last_ptr = NULL;
-	struct btrfs_block_group_cache *block_group;
+	struct btrfs_block_group_cache *block_group = NULL;
 	int chunk_alloc_done = 0;
 	int empty_cluster = 2 * 1024 * 1024;
 	int allowed_chunk_alloc = 0;
+	struct list_head *head = NULL, *cur = NULL;
+	int loop = 0;
+	struct btrfs_space_info *space_info;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+	ins->objectid = 0;
+	ins->offset = 0;
 
 	if (orig_root->ref_cows || empty_size)
 		allowed_chunk_alloc = 1;
@@ -2239,152 +2199,132 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		else
 			empty_size += empty_cluster;
 	}
-
 	search_start = max(search_start, first_logical_byte(root, 0));
-	orig_search_start = search_start;
-
 	search_start = max(search_start, hint_byte);
 	total_needed += empty_size;
 
-new_group:
-	block_group = btrfs_lookup_block_group(info, search_start);
-	if (!block_group)
-		block_group = btrfs_lookup_first_block_group(info,
-							     search_start);
+	block_group = btrfs_lookup_block_group(root->fs_info, search_start);
+	space_info = __find_space_info(root->fs_info, data);
 
-	/*
-	 * Ok this looks a little tricky, buts its really simple.  First if we
-	 * didn't find a block group obviously we want to start over.
-	 * Secondly, if the block group we found does not match the type we
-	 * need, and we have a last_ptr and its not 0, chances are the last
-	 * allocation we made was at the end of the block group, so lets go
-	 * ahead and skip the looking through the rest of the block groups and
-	 * start at the beginning.  This helps with metadata allocations,
-	 * since you are likely to have a bunch of data block groups to search
-	 * through first before you realize that you need to start over, so go
-	 * ahead and start over and save the time.
-	 */
-	if (!block_group || (!block_group_bits(block_group, data) &&
-			     last_ptr && *last_ptr)) {
-		if (search_start != orig_search_start) {
-			if (last_ptr && *last_ptr) {
-				total_needed += empty_cluster;
-				*last_ptr = 0;
-			}
-			search_start = orig_search_start;
-			goto new_group;
-		} else if (!chunk_alloc_done && allowed_chunk_alloc) {
-			ret = do_chunk_alloc(trans, root,
-					     num_bytes + 2 * 1024 * 1024,
-					     data, 1);
-			if (ret < 0)
-				goto error;
-			BUG_ON(ret);
-			chunk_alloc_done = 1;
-			search_start = orig_search_start;
-			goto new_group;
-		} else {
-			ret = -ENOSPC;
-			goto error;
-		}
-	}
-
-	/*
-	 * this is going to seach through all of the existing block groups it
-	 * can find, so if we don't find something we need to see if we can
-	 * allocate what we need.
-	 */
-	ret = find_free_space(root, &block_group, &search_start,
-			      total_needed, data);
-	if (ret == -ENOSPC) {
-		/*
-		 * instead of allocating, start at the original search start
-		 * and see if there is something to be found, if not then we
-		 * allocate
-		 */
-		if (search_start != orig_search_start) {
-			if (last_ptr && *last_ptr) {
-				*last_ptr = 0;
-				total_needed += empty_cluster;
-			}
-			search_start = orig_search_start;
-			goto new_group;
-		}
-
-		/*
-		 * we've already allocated, we're pretty screwed
-		 */
-		if (chunk_alloc_done) {
-			goto error;
-		} else if (!allowed_chunk_alloc && block_group &&
-			   block_group_bits(block_group, data)) {
-			block_group->space_info->force_alloc = 1;
-			goto error;
-		} else if (!allowed_chunk_alloc) {
-			goto error;
-		}
-
-		ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024,
-				     data, 1);
-		if (ret < 0)
-			goto error;
-
-		BUG_ON(ret);
-		chunk_alloc_done = 1;
-		if (block_group)
-			search_start = block_group->key.objectid +
-				block_group->key.offset;
-		else
-			search_start = orig_search_start;
-		goto new_group;
-	}
-
-	if (ret)
-		goto error;
-
-	search_start = stripe_align(root, search_start);
-	ins->objectid = search_start;
-	ins->offset = num_bytes;
-
-	if (ins->objectid + num_bytes >= search_end) {
-		search_start = orig_search_start;
-		if (chunk_alloc_done) {
-			ret = -ENOSPC;
-			goto error;
-		}
-		goto new_group;
-	}
-
-	if (ins->objectid + num_bytes >
-	    block_group->key.objectid + block_group->key.offset) {
-		if (search_start == orig_search_start && chunk_alloc_done) {
-			ret = -ENOSPC;
-			goto error;
-		}
-		search_start = block_group->key.objectid +
-			block_group->key.offset;
-		goto new_group;
-	}
-
-	if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start &&
-	    ins->objectid < exclude_start + exclude_nr)) {
-		search_start = exclude_start + exclude_nr;
-		goto new_group;
-	}
-
-	if (!(data & BTRFS_BLOCK_GROUP_DATA))
-		trans->block_group = block_group;
-
-	ins->offset = num_bytes;
-	if (last_ptr) {
-		*last_ptr = ins->objectid + ins->offset;
-		if (*last_ptr ==
-		    btrfs_super_total_bytes(&root->fs_info->super_copy))
-			*last_ptr = 0;
-	}
-
-	ret = 0;
-error:
+	down_read(&space_info->groups_sem);
+	while (1) {
+		struct btrfs_free_space *free_space;
+		/*
+		 * the only way this happens if our hint points to a block
+		 * group thats not of the proper type, while looping this
+		 * should never happen
+		 */
+		if (unlikely(!block_group_bits(block_group, data)))
+			goto new_group;
+
+		ret = cache_block_group(root, block_group);
+		if (ret)
+			break;
+
+		if (block_group->ro)
+			goto new_group;
+
+		free_space = btrfs_find_free_space(block_group, search_start,
+						   total_needed);
+		if (free_space) {
+			u64 start = block_group->key.objectid;
+			u64 end = block_group->key.objectid +
+				block_group->key.offset;
+
+			search_start = stripe_align(root, free_space->offset);
+
+			/* move on to the next group */
+			if (search_start + num_bytes >= search_end)
+				goto new_group;
+
+			/* move on to the next group */
+			if (search_start + num_bytes > end)
+				goto new_group;
+
+			if (exclude_nr > 0 &&
+			    (search_start + num_bytes > exclude_start &&
+			     search_start < exclude_start + exclude_nr)) {
+				search_start = exclude_start + exclude_nr;
+				/*
+				 * if search_start is still in this block group
+				 * then we just re-search this block group
+				 */
+				if (search_start >= start &&
+				    search_start < end)
+					continue;
+
+				/* else we go to the next block group */
+				goto new_group;
+			}
+
+			ins->objectid = search_start;
+			ins->offset = num_bytes;
+			/* we are all good, lets return */
+			break;
+		}
+new_group:
+		/*
+		 * Here's how this works.
+		 * loop == 0: we were searching a block group via a hint
+		 *	      and didn't find anything, so we start at
+		 *	      the head of the block groups and keep searching
+		 * loop == 1: we're searching through all of the block groups
+		 *	      if we hit the head again we have searched
+		 *	      all of the block groups for this space and we
+		 *	      need to try and allocate, if we cant error out.
+		 * loop == 2: we allocated more space and are looping through
+		 *	      all of the block groups again.
+		 */
+		if (loop == 0) {
+			head = &space_info->block_groups;
+			cur = head->next;
+
+			if (last_ptr && *last_ptr) {
+				total_needed += empty_cluster;
+				*last_ptr = 0;
+			}
+			loop++;
+		} else if (loop == 1 && cur == head) {
+			if (allowed_chunk_alloc && !chunk_alloc_done) {
+				up_read(&space_info->groups_sem);
+				ret = do_chunk_alloc(trans, root, num_bytes +
+						     2 * 1024 * 1024, data, 1);
+				if (ret < 0)
+					break;
+				down_read(&space_info->groups_sem);
+				loop++;
+				head = &space_info->block_groups;
+				cur = head->next;
+				chunk_alloc_done = 1;
+			} else if (!allowed_chunk_alloc) {
+				space_info->force_alloc = 1;
+				break;
+			} else {
+				break;
+			}
+		} else if (cur == head) {
+			break;
+		}
+
+		block_group = list_entry(cur, struct btrfs_block_group_cache,
+					 list);
+		search_start = block_group->key.objectid;
+		cur = cur->next;
+	}
+
+	/* we found what we needed */
+	if (ins->objectid) {
+		if (!(data & BTRFS_BLOCK_GROUP_DATA))
+			trans->block_group = block_group;
+
+		if (last_ptr)
+			*last_ptr = ins->objectid + ins->offset;
+		ret = 0;
+	} else if (!ret) {
+		ret = -ENOSPC;
+	}
+
+	up_read(&space_info->groups_sem);
 	return ret;
 }
 
@@ -2397,7 +2337,7 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 	       info->total_bytes - info->bytes_used - info->bytes_pinned -
 	       info->bytes_reserved, (info->full) ? "" : "not ");
 
-	spin_lock(&info->lock);
+	down_read(&info->groups_sem);
 	list_for_each(l, &info->block_groups) {
 		cache = list_entry(l, struct btrfs_block_group_cache, list);
 		spin_lock(&cache->lock);
@@ -2409,7 +2349,7 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 		btrfs_dump_free_space(cache, bytes);
 		spin_unlock(&cache->lock);
 	}
-	spin_unlock(&info->lock);
+	up_read(&info->groups_sem);
 }
 
 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
@@ -5186,9 +5126,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 
 		rb_erase(&block_group->cache_node,
 			 &info->block_group_cache_tree);
-		spin_lock(&block_group->space_info->lock);
+		down_write(&block_group->space_info->groups_sem);
 		list_del(&block_group->list);
-		spin_unlock(&block_group->space_info->lock);
+		up_write(&block_group->space_info->groups_sem);
 		kfree(block_group);
 	}
 	spin_unlock(&info->block_group_cache_lock);
@@ -5249,9 +5189,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 					&space_info);
 		BUG_ON(ret);
 		cache->space_info = space_info;
-		spin_lock(&space_info->lock);
-		list_add(&cache->list, &space_info->block_groups);
-		spin_unlock(&space_info->lock);
+		down_write(&space_info->groups_sem);
+		list_add_tail(&cache->list, &space_info->block_groups);
+		up_write(&space_info->groups_sem);
 
 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
 		BUG_ON(ret);
@@ -5297,9 +5237,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
 				&cache->space_info);
 	BUG_ON(ret);
-	spin_lock(&cache->space_info->lock);
-	list_add(&cache->list, &cache->space_info->block_groups);
-	spin_unlock(&cache->space_info->lock);
+	down_write(&cache->space_info->groups_sem);
+	list_add_tail(&cache->list, &cache->space_info->block_groups);
+	up_write(&cache->space_info->groups_sem);
 
 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
 	BUG_ON(ret);
@@ -5338,9 +5278,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	btrfs_remove_free_space_cache(block_group);
 	rb_erase(&block_group->cache_node,
 		 &root->fs_info->block_group_cache_tree);
-	spin_lock(&block_group->space_info->lock);
+	down_write(&block_group->space_info->groups_sem);
 	list_del(&block_group->list);
-	spin_unlock(&block_group->space_info->lock);
+	up_write(&block_group->space_info->groups_sem);
 
 	/*
 	memset(shrink_block_group, 0, sizeof(*shrink_block_group));
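
[Editor's illustration] One note on the locking conversions in the hunks above:
the group list is now walked under a sleeping reader/writer lock rather than a
spinlock, since the allocator can block (cache_block_group() does I/O) while it
traverses the list.  Below is a rough userspace sketch of the same
reader/writer pattern; pthread_rwlock_t stands in for the kernel's struct
rw_semaphore, and the counter is a toy stand-in for space_info->block_groups.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t groups_sem = PTHREAD_RWLOCK_INITIALIZER;
static int ngroups = 2;			/* stands in for the block-group list */

static void *allocator(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&groups_sem);	/* down_read() in find_free_extent */
	for (int i = 0; i < ngroups; i++)
		printf("searching group %d\n", i);	/* may sleep in the kernel */
	pthread_rwlock_unlock(&groups_sem);	/* up_read() */
	return NULL;
}

static void *group_adder(void *arg)
{
	(void)arg;
	pthread_rwlock_wrlock(&groups_sem);	/* down_write() in btrfs_make_block_group */
	ngroups++;				/* list_add_tail(&cache->list, ...) */
	pthread_rwlock_unlock(&groups_sem);	/* up_write() */
	return NULL;
}

int main(void)
{
	pthread_t r, w;
	pthread_create(&r, NULL, allocator, NULL);
	pthread_create(&w, NULL, group_adder, NULL);
	pthread_join(r, NULL);
	pthread_join(w, NULL);
	return 0;
}

Multiple allocators can search concurrently under the read side, while adding
or removing a block group takes the write side exclusively; a spinlock would
allow neither the concurrency nor the sleeping.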