author		Chris Mason <chris.mason@oracle.com>	2009-04-03 09:47:43 -0400
committer	Chris Mason <chris.mason@oracle.com>	2009-04-03 09:47:43 -0400
commit		fa9c0d795f7b57c76560b7fac703f5d341210e28 (patch)
tree		74d9d9846e21ce5b99738f3cc13b855fb63d1eba /fs/btrfs/free-space-cache.c
parent		8e73f275011b3264a87339fd9f1690e944e381c9 (diff)
Btrfs: rework allocation clustering
Because btrfs is copy-on-write, we end up picking new locations for blocks very often. This makes it fairly difficult to maintain perfect read patterns over time, but we can at least do some optimizations for writes.

This is done today by remembering the last place we allocated and trying to find a free space hole big enough to hold more than just one allocation. The end result is that we tend to write sequentially to the drive.

This happens all the time for metadata and it happens for data when mounted -o ssd. But, the way we record it is fairly racy and it tends to fragment the free space over time because we are trying to allocate fairly large areas at once.

This commit gets rid of the races by adding a free space cluster object with dedicated locking to make sure that only one process at a time is out replacing the cluster.

The free space fragmentation is somewhat solved by allowing a cluster to be comprised of smaller free space extents. This part definitely adds some CPU time to the cluster allocations, but it allows the allocator to consume the small holes left behind by cow.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
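The cluster object itself, struct btrfs_free_cluster, lives in free-space-cache.h and is not part of this diff. As a rough sketch, inferred only from the fields the functions below touch (the real header is authoritative and may differ):

/* sketch only -- fields inferred from their usage in this patch */
struct btrfs_free_cluster {
	spinlock_t lock;		/* protects the rbtree and block_group pointer */
	spinlock_t refill_lock;		/* lets only one task refill/replace the cluster */
	struct rb_root root;		/* free space entries owned by the cluster, indexed by offset */
	u64 max_size;			/* largest single extent currently in the cluster */
	u64 window_start;		/* logical start of the allocation window */
	struct btrfs_block_group_cache *block_group;	/* block group the cluster was carved from */
	struct list_head block_group_list;	/* entry in block_group->cluster_list */
};

The split between lock and refill_lock is what removes the old races: lock guards the cluster's contents, while refill_lock (initialized in btrfs_init_free_cluster below) is what callers take so only one process at a time rebuilds the window.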
Diffstat (limited to 'fs/btrfs/free-space-cache.c')
-rw-r--r--	fs/btrfs/free-space-cache.c	297
1 file changed, 297 insertions(+), 0 deletions(-)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index df19b60eef61..3fdadd28e935 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,15 @@
 
 #include <linux/sched.h>
 #include "ctree.h"
+#include "free-space-cache.h"
+#include "transaction.h"
+
+struct btrfs_free_space {
+	struct rb_node bytes_index;
+	struct rb_node offset_index;
+	u64 offset;
+	u64 bytes;
+};
 
 static int tree_insert_offset(struct rb_root *root, u64 offset,
 			      struct rb_node *node)
@@ -371,12 +380,58 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
 	return ret;
 }
 
+/*
+ * for a given cluster, put all of its extents back into the free
+ * space cache. If the block group passed doesn't match the block group
+ * pointed to by the cluster, someone else raced in and freed the
+ * cluster already. In that case, we just return without changing anything
+ */
+static int
+__btrfs_return_cluster_to_free_space(
+			     struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster)
+{
+	struct btrfs_free_space *entry;
+	struct rb_node *node;
+
+	spin_lock(&cluster->lock);
+	if (cluster->block_group != block_group)
+		goto out;
+
+	cluster->window_start = 0;
+	node = rb_first(&cluster->root);
+	while(node) {
+		entry = rb_entry(node, struct btrfs_free_space, offset_index);
+		node = rb_next(&entry->offset_index);
+		rb_erase(&entry->offset_index, &cluster->root);
+		link_free_space(block_group, entry);
+	}
+	list_del_init(&cluster->block_group_list);
+
+	btrfs_put_block_group(cluster->block_group);
+	cluster->block_group = NULL;
+	cluster->root.rb_node = NULL;
+out:
+	spin_unlock(&cluster->lock);
+	return 0;
+}
+
 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
 {
 	struct btrfs_free_space *info;
 	struct rb_node *node;
+	struct btrfs_free_cluster *cluster;
+	struct btrfs_free_cluster *safe;
 
 	spin_lock(&block_group->tree_lock);
+
+	list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
+				 block_group_list) {
+
+		WARN_ON(cluster->block_group != block_group);
+		__btrfs_return_cluster_to_free_space(block_group, cluster);
+	}
+
 	while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
 		info = rb_entry(node, struct btrfs_free_space, bytes_index);
 		unlink_free_space(block_group, info);
@@ -417,3 +472,245 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 
 	return ret;
 }
+
+/*
+ * given a cluster, put all of its extents back into the free space
+ * cache. If a block group is passed, this function will only free
+ * a cluster that belongs to the passed block group.
+ *
+ * Otherwise, it'll get a reference on the block group pointed to by the
+ * cluster and remove the cluster from it.
+ */
+int btrfs_return_cluster_to_free_space(
+			       struct btrfs_block_group_cache *block_group,
+			       struct btrfs_free_cluster *cluster)
+{
+	int ret;
+
+	/* first, get a safe pointer to the block group */
+	spin_lock(&cluster->lock);
+	if (!block_group) {
+		block_group = cluster->block_group;
+		if (!block_group) {
+			spin_unlock(&cluster->lock);
+			return 0;
+		}
+	} else if (cluster->block_group != block_group) {
+		/* someone else has already freed it don't redo their work */
+		spin_unlock(&cluster->lock);
+		return 0;
+	}
+	atomic_inc(&block_group->count);
+	spin_unlock(&cluster->lock);
+
+	/* now return any extents the cluster had on it */
+	spin_lock(&block_group->tree_lock);
+	ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
+	spin_unlock(&block_group->tree_lock);
+
+	/* finally drop our ref */
+	btrfs_put_block_group(block_group);
+	return ret;
+}
+
+/*
+ * given a cluster, try to allocate 'bytes' from it, returns 0
+ * if it couldn't find anything suitably large, or a logical disk offset
+ * if things worked out
+ */
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster, u64 bytes,
+			     u64 min_start)
+{
+	struct btrfs_free_space *entry = NULL;
+	struct rb_node *node;
+	u64 ret = 0;
+
+	spin_lock(&cluster->lock);
+	if (bytes > cluster->max_size)
+		goto out;
+
+	if (cluster->block_group != block_group)
+		goto out;
+
+	node = rb_first(&cluster->root);
+	if (!node)
+		goto out;
+
+	entry = rb_entry(node, struct btrfs_free_space, offset_index);
+
+	while(1) {
+		if (entry->bytes < bytes || entry->offset < min_start) {
+			struct rb_node *node;
+
+			node = rb_next(&entry->offset_index);
+			if (!node)
+				break;
+			entry = rb_entry(node, struct btrfs_free_space,
+					 offset_index);
+			continue;
+		}
+		ret = entry->offset;
+
+		entry->offset += bytes;
+		entry->bytes -= bytes;
+
+		if (entry->bytes == 0) {
+			rb_erase(&entry->offset_index, &cluster->root);
+			kfree(entry);
+		}
+		break;
+	}
+out:
+	spin_unlock(&cluster->lock);
+	return ret;
+}
+
+/*
+ * here we try to find a cluster of blocks in a block group. The goal
+ * is to find at least bytes free and up to empty_size + bytes free.
+ * We might not find them all in one contiguous area.
+ *
+ * returns zero and sets up cluster if things worked out, otherwise
+ * it returns -enospc
+ */
+int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
+			     struct btrfs_block_group_cache *block_group,
+			     struct btrfs_free_cluster *cluster,
+			     u64 offset, u64 bytes, u64 empty_size)
+{
+	struct btrfs_free_space *entry = NULL;
+	struct rb_node *node;
+	struct btrfs_free_space *next;
+	struct btrfs_free_space *last;
+	u64 min_bytes;
+	u64 window_start;
+	u64 window_free;
+	u64 max_extent = 0;
+	int total_retries = 0;
+	int ret;
+
+	/* for metadata, allow allocates with more holes */
+	if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
+		/*
+		 * we want to do larger allocations when we are
+		 * flushing out the delayed refs, it helps prevent
+		 * making more work as we go along.
+		 */
+		if (trans->transaction->delayed_refs.flushing)
+			min_bytes = max(bytes, (bytes + empty_size) >> 1);
+		else
+			min_bytes = max(bytes, (bytes + empty_size) >> 4);
+	} else
+		min_bytes = max(bytes, (bytes + empty_size) >> 2);
+
+	spin_lock(&block_group->tree_lock);
+	spin_lock(&cluster->lock);
+
+	/* someone already found a cluster, hooray */
+	if (cluster->block_group) {
+		ret = 0;
+		goto out;
+	}
+again:
+	min_bytes = min(min_bytes, bytes + empty_size);
+	entry = tree_search_bytes(&block_group->free_space_bytes,
+				  offset, min_bytes);
+	if (!entry) {
+		ret = -ENOSPC;
+		goto out;
+	}
+	window_start = entry->offset;
+	window_free = entry->bytes;
+	last = entry;
+	max_extent = entry->bytes;
+
+	while(1) {
+		/* out window is just right, lets fill it */
+		if (window_free >= bytes + empty_size)
+			break;
+
+		node = rb_next(&last->offset_index);
+		if (!node) {
+			ret = -ENOSPC;
+			goto out;
+		}
+		next = rb_entry(node, struct btrfs_free_space, offset_index);
+
+		/*
+		 * we haven't filled the empty size and the window is
+		 * very large. reset and try again
+		 */
+		if (next->offset - window_start > (bytes + empty_size) * 2) {
+			entry = next;
+			window_start = entry->offset;
+			window_free = entry->bytes;
+			last = entry;
+			max_extent = 0;
+			total_retries++;
+			if (total_retries % 256 == 0) {
+				if (min_bytes >= (bytes + empty_size)) {
+					ret = -ENOSPC;
+					goto out;
+				}
+				/*
+				 * grow our allocation a bit, we're not having
+				 * much luck
+				 */
+				min_bytes *= 2;
+				goto again;
+			}
+		} else {
+			last = next;
+			window_free += next->bytes;
+			if (entry->bytes > max_extent)
+				max_extent = entry->bytes;
+		}
+	}
+
+	cluster->window_start = entry->offset;
+
+	/*
+	 * now we've found our entries, pull them out of the free space
+	 * cache and put them into the cluster rbtree
+	 *
+	 * The cluster includes an rbtree, but only uses the offset index
+	 * of each free space cache entry.
+	 */
+	while(1) {
+		node = rb_next(&entry->offset_index);
+		unlink_free_space(block_group, entry);
+		ret = tree_insert_offset(&cluster->root, entry->offset,
+					 &entry->offset_index);
+		BUG_ON(ret);
+
+		if (!node || entry == last)
+			break;
+
+		entry = rb_entry(node, struct btrfs_free_space, offset_index);
+	}
+	ret = 0;
+	cluster->max_size = max_extent;
+	atomic_inc(&block_group->count);
+	list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
+	cluster->block_group = block_group;
+out:
+	spin_unlock(&cluster->lock);
+	spin_unlock(&block_group->tree_lock);
+
+	return ret;
+}
+
+/*
+ * simple code to zero out a cluster
+ */
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
+{
+	spin_lock_init(&cluster->lock);
+	spin_lock_init(&cluster->refill_lock);
+	cluster->root.rb_node = NULL;
+	cluster->max_size = 0;
+	INIT_LIST_HEAD(&cluster->block_group_list);
+	cluster->block_group = NULL;
+}
+
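For readers following the new API, here is a minimal sketch of how an allocator path might drive it, assuming btrfs_init_free_cluster() was called once on the cluster at setup. The wrapper function, its name, and its error handling are hypothetical and only illustrate call order; the four cluster functions and refill_lock come from this patch:

/* hypothetical caller -- a sketch of the intended call order, not kernel source */
static u64 alloc_with_cluster(struct btrfs_trans_handle *trans,
			      struct btrfs_block_group_cache *block_group,
			      struct btrfs_free_cluster *cluster,
			      u64 search_start, u64 num_bytes, u64 empty_size)
{
	u64 start;
	int ret;

	/* refill_lock: only one task at a time gets to replace the cluster */
	spin_lock(&cluster->refill_lock);

	/* first try to carve num_bytes out of what the cluster already holds */
	start = btrfs_alloc_from_cluster(block_group, cluster, num_bytes,
					 search_start);
	if (!start) {
		/* cluster empty or too small: build a new window of free extents */
		ret = btrfs_find_space_cluster(trans, block_group, cluster,
					       search_start, num_bytes,
					       empty_size);
		if (!ret)
			start = btrfs_alloc_from_cluster(block_group, cluster,
							 num_bytes, search_start);
	}
	spin_unlock(&cluster->refill_lock);

	/* on failure, hand the cluster's extents back to the free space cache */
	if (!start)
		btrfs_return_cluster_to_free_space(block_group, cluster);
	return start;
}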