about summary refs log tree commit diff stats
path: root/fs/btrfs/volumes.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r-- fs/btrfs/volumes.c 688
1 files changed, 504 insertions, 184 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6b9884507837..dd13eb81ee40 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -22,6 +22,7 @@
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include "compat.h" 27#include "compat.h"
27#include "ctree.h" 28#include "ctree.h"
@@ -493,7 +494,7 @@ again:
493 continue; 494 continue;
494 495
495 if (device->bdev) { 496 if (device->bdev) {
496 close_bdev_exclusive(device->bdev, device->mode); 497 blkdev_put(device->bdev, device->mode);
497 device->bdev = NULL; 498 device->bdev = NULL;
498 fs_devices->open_devices--; 499 fs_devices->open_devices--;
499 } 500 }
@@ -527,7 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
527 528
528 list_for_each_entry(device, &fs_devices->devices, dev_list) { 529 list_for_each_entry(device, &fs_devices->devices, dev_list) {
529 if (device->bdev) { 530 if (device->bdev) {
530 close_bdev_exclusive(device->bdev, device->mode); 531 blkdev_put(device->bdev, device->mode);
531 fs_devices->open_devices--; 532 fs_devices->open_devices--;
532 } 533 }
533 if (device->writeable) { 534 if (device->writeable) {
@@ -584,13 +585,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
584 int seeding = 1; 585 int seeding = 1;
585 int ret = 0; 586 int ret = 0;
586 587
588 flags |= FMODE_EXCL;
589
587 list_for_each_entry(device, head, dev_list) { 590 list_for_each_entry(device, head, dev_list) {
588 if (device->bdev) 591 if (device->bdev)
589 continue; 592 continue;
590 if (!device->name) 593 if (!device->name)
591 continue; 594 continue;
592 595
593 bdev = open_bdev_exclusive(device->name, flags, holder); 596 bdev = blkdev_get_by_path(device->name, flags, holder);
594 if (IS_ERR(bdev)) { 597 if (IS_ERR(bdev)) {
595 printk(KERN_INFO "open %s failed\n", device->name); 598 printk(KERN_INFO "open %s failed\n", device->name);
596 goto error; 599 goto error;
@@ -598,8 +601,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
598 set_blocksize(bdev, 4096); 601 set_blocksize(bdev, 4096);
599 602
600 bh = btrfs_read_dev_super(bdev); 603 bh = btrfs_read_dev_super(bdev);
601 if (!bh) 604 if (!bh) {
605 ret = -EINVAL;
602 goto error_close; 606 goto error_close;
607 }
603 608
604 disk_super = (struct btrfs_super_block *)bh->b_data; 609 disk_super = (struct btrfs_super_block *)bh->b_data;
605 devid = btrfs_stack_device_id(&disk_super->dev_item); 610 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -642,7 +647,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
642error_brelse: 647error_brelse:
643 brelse(bh); 648 brelse(bh);
644error_close: 649error_close:
645 close_bdev_exclusive(bdev, FMODE_READ); 650 blkdev_put(bdev, flags);
646error: 651error:
647 continue; 652 continue;
648 } 653 }
@@ -688,7 +693,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
688 693
689 mutex_lock(&uuid_mutex); 694 mutex_lock(&uuid_mutex);
690 695
691 bdev = open_bdev_exclusive(path, flags, holder); 696 flags |= FMODE_EXCL;
697 bdev = blkdev_get_by_path(path, flags, holder);
692 698
693 if (IS_ERR(bdev)) { 699 if (IS_ERR(bdev)) {
694 ret = PTR_ERR(bdev); 700 ret = PTR_ERR(bdev);
@@ -700,7 +706,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
700 goto error_close; 706 goto error_close;
701 bh = btrfs_read_dev_super(bdev); 707 bh = btrfs_read_dev_super(bdev);
702 if (!bh) { 708 if (!bh) {
703 ret = -EIO; 709 ret = -EINVAL;
704 goto error_close; 710 goto error_close;
705 } 711 }
706 disk_super = (struct btrfs_super_block *)bh->b_data; 712 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -720,65 +726,173 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
720 726
721 brelse(bh); 727 brelse(bh);
722error_close: 728error_close:
723 close_bdev_exclusive(bdev, flags); 729 blkdev_put(bdev, flags);
724error: 730error:
725 mutex_unlock(&uuid_mutex); 731 mutex_unlock(&uuid_mutex);
726 return ret; 732 return ret;
727} 733}
728 734
735/* helper to account the used device space in the range */
736int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
737 u64 end, u64 *length)
738{
739 struct btrfs_key key;
740 struct btrfs_root *root = device->dev_root;
741 struct btrfs_dev_extent *dev_extent;
742 struct btrfs_path *path;
743 u64 extent_end;
744 int ret;
745 int slot;
746 struct extent_buffer *l;
747
748 *length = 0;
749
750 if (start >= device->total_bytes)
751 return 0;
752
753 path = btrfs_alloc_path();
754 if (!path)
755 return -ENOMEM;
756 path->reada = 2;
757
758 key.objectid = device->devid;
759 key.offset = start;
760 key.type = BTRFS_DEV_EXTENT_KEY;
761
762 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
763 if (ret < 0)
764 goto out;
765 if (ret > 0) {
766 ret = btrfs_previous_item(root, path, key.objectid, key.type);
767 if (ret < 0)
768 goto out;
769 }
770
771 while (1) {
772 l = path->nodes[0];
773 slot = path->slots[0];
774 if (slot >= btrfs_header_nritems(l)) {
775 ret = btrfs_next_leaf(root, path);
776 if (ret == 0)
777 continue;
778 if (ret < 0)
779 goto out;
780
781 break;
782 }
783 btrfs_item_key_to_cpu(l, &key, slot);
784
785 if (key.objectid < device->devid)
786 goto next;
787
788 if (key.objectid > device->devid)
789 break;
790
791 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
792 goto next;
793
794 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
795 extent_end = key.offset + btrfs_dev_extent_length(l,
796 dev_extent);
797 if (key.offset <= start && extent_end > end) {
798 *length = end - start + 1;
799 break;
800 } else if (key.offset <= start && extent_end > start)
801 *length += extent_end - start;
802 else if (key.offset > start && extent_end <= end)
803 *length += extent_end - key.offset;
804 else if (key.offset > start && key.offset <= end) {
805 *length += end - key.offset + 1;
806 break;
807 } else if (key.offset > end)
808 break;
809
810next:
811 path->slots[0]++;
812 }
813 ret = 0;
814out:
815 btrfs_free_path(path);
816 return ret;
817}
818
729/* 819/*
820 * find_free_dev_extent - find free space in the specified device
821 * @trans: transaction handle
822 * @device: the device which we search the free space in
823 * @num_bytes: the size of the free space that we need
824 * @start: store the start of the free space.
825 * @len: the size of the free space that we find, or the size of the max
826 * free space if we don't find suitable free space
827 *
730 * this uses a pretty simple search, the expectation is that it is 828 * this uses a pretty simple search, the expectation is that it is
731 * called very infrequently and that a given device has a small number 829 * called very infrequently and that a given device has a small number
732 * of extents 830 * of extents
831 *
832 * @start is used to store the start of the free space if we find it. But if we
833 * don't find suitable free space, it will be used to store the start position
834 * of the max free space.
835 *
836 * @len is used to store the size of the free space that we find.
837 * But if we don't find suitable free space, it is used to store the size of
838 * the max free space.
733 */ 839 */
734int find_free_dev_extent(struct btrfs_trans_handle *trans, 840int find_free_dev_extent(struct btrfs_trans_handle *trans,
735 struct btrfs_device *device, u64 num_bytes, 841 struct btrfs_device *device, u64 num_bytes,
736 u64 *start, u64 *max_avail) 842 u64 *start, u64 *len)
737{ 843{
738 struct btrfs_key key; 844 struct btrfs_key key;
739 struct btrfs_root *root = device->dev_root; 845 struct btrfs_root *root = device->dev_root;
740 struct btrfs_dev_extent *dev_extent = NULL; 846 struct btrfs_dev_extent *dev_extent;
741 struct btrfs_path *path; 847 struct btrfs_path *path;
742 u64 hole_size = 0; 848 u64 hole_size;
743 u64 last_byte = 0; 849 u64 max_hole_start;
744 u64 search_start = 0; 850 u64 max_hole_size;
851 u64 extent_end;
852 u64 search_start;
745 u64 search_end = device->total_bytes; 853 u64 search_end = device->total_bytes;
746 int ret; 854 int ret;
747 int slot = 0; 855 int slot;
748 int start_found;
749 struct extent_buffer *l; 856 struct extent_buffer *l;
750 857
751 path = btrfs_alloc_path();
752 if (!path)
753 return -ENOMEM;
754 path->reada = 2;
755 start_found = 0;
756
757 /* FIXME use last free of some kind */ 858 /* FIXME use last free of some kind */
758 859
759 /* we don't want to overwrite the superblock on the drive, 860 /* we don't want to overwrite the superblock on the drive,
760 * so we make sure to start at an offset of at least 1MB 861 * so we make sure to start at an offset of at least 1MB
761 */ 862 */
762 search_start = max((u64)1024 * 1024, search_start); 863 search_start = 1024 * 1024;
763 864
764 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) 865 if (root->fs_info->alloc_start + num_bytes <= search_end)
765 search_start = max(root->fs_info->alloc_start, search_start); 866 search_start = max(root->fs_info->alloc_start, search_start);
766 867
868 max_hole_start = search_start;
869 max_hole_size = 0;
870
871 if (search_start >= search_end) {
872 ret = -ENOSPC;
873 goto error;
874 }
875
876 path = btrfs_alloc_path();
877 if (!path) {
878 ret = -ENOMEM;
879 goto error;
880 }
881 path->reada = 2;
882
767 key.objectid = device->devid; 883 key.objectid = device->devid;
768 key.offset = search_start; 884 key.offset = search_start;
769 key.type = BTRFS_DEV_EXTENT_KEY; 885 key.type = BTRFS_DEV_EXTENT_KEY;
886
770 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 887 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
771 if (ret < 0) 888 if (ret < 0)
772 goto error; 889 goto out;
773 if (ret > 0) { 890 if (ret > 0) {
774 ret = btrfs_previous_item(root, path, key.objectid, key.type); 891 ret = btrfs_previous_item(root, path, key.objectid, key.type);
775 if (ret < 0) 892 if (ret < 0)
776 goto error; 893 goto out;
777 if (ret > 0)
778 start_found = 1;
779 } 894 }
780 l = path->nodes[0]; 895
781 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
782 while (1) { 896 while (1) {
783 l = path->nodes[0]; 897 l = path->nodes[0];
784 slot = path->slots[0]; 898 slot = path->slots[0];
@@ -787,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
787 if (ret == 0) 901 if (ret == 0)
788 continue; 902 continue;
789 if (ret < 0) 903 if (ret < 0)
790 goto error; 904 goto out;
791no_more_items: 905
792 if (!start_found) { 906 break;
793 if (search_start >= search_end) {
794 ret = -ENOSPC;
795 goto error;
796 }
797 *start = search_start;
798 start_found = 1;
799 goto check_pending;
800 }
801 *start = last_byte > search_start ?
802 last_byte : search_start;
803 if (search_end <= *start) {
804 ret = -ENOSPC;
805 goto error;
806 }
807 goto check_pending;
808 } 907 }
809 btrfs_item_key_to_cpu(l, &key, slot); 908 btrfs_item_key_to_cpu(l, &key, slot);
810 909
@@ -812,48 +911,62 @@ no_more_items:
812 goto next; 911 goto next;
813 912
814 if (key.objectid > device->devid) 913 if (key.objectid > device->devid)
815 goto no_more_items; 914 break;
816 915
817 if (key.offset >= search_start && key.offset > last_byte && 916 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
818 start_found) { 917 goto next;
819 if (last_byte < search_start)
820 last_byte = search_start;
821 hole_size = key.offset - last_byte;
822 918
823 if (hole_size > *max_avail) 919 if (key.offset > search_start) {
824 *max_avail = hole_size; 920 hole_size = key.offset - search_start;
825 921
826 if (key.offset > last_byte && 922 if (hole_size > max_hole_size) {
827 hole_size >= num_bytes) { 923 max_hole_start = search_start;
828 *start = last_byte; 924 max_hole_size = hole_size;
829 goto check_pending; 925 }
926
927 /*
928 * If this free space is at least as large as what we need,
929 * it must be the max free space that we have found
930 * until now, so max_hole_start must point to the start
931 * of this free space and the length of this free space
932 * is stored in max_hole_size. Thus, we return
933 * max_hole_start and max_hole_size and go back to the
934 * caller.
935 */
936 if (hole_size >= num_bytes) {
937 ret = 0;
938 goto out;
830 } 939 }
831 } 940 }
832 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
833 goto next;
834 941
835 start_found = 1;
836 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 942 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
837 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); 943 extent_end = key.offset + btrfs_dev_extent_length(l,
944 dev_extent);
945 if (extent_end > search_start)
946 search_start = extent_end;
838next: 947next:
839 path->slots[0]++; 948 path->slots[0]++;
840 cond_resched(); 949 cond_resched();
841 } 950 }
842check_pending:
843 /* we have to make sure we didn't find an extent that has already
844 * been allocated by the map tree or the original allocation
845 */
846 BUG_ON(*start < search_start);
847 951
848 if (*start + num_bytes > search_end) { 952 hole_size = search_end- search_start;
849 ret = -ENOSPC; 953 if (hole_size > max_hole_size) {
850 goto error; 954 max_hole_start = search_start;
955 max_hole_size = hole_size;
851 } 956 }
852 /* check for pending inserts here */
853 ret = 0;
854 957
855error: 958 /* See above. */
959 if (hole_size < num_bytes)
960 ret = -ENOSPC;
961 else
962 ret = 0;
963
964out:
856 btrfs_free_path(path); 965 btrfs_free_path(path);
966error:
967 *start = max_hole_start;
968 if (len)
969 *len = max_hole_size;
857 return ret; 970 return ret;
858} 971}
859 972
@@ -1100,6 +1213,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1100 return -ENOMEM; 1213 return -ENOMEM;
1101 1214
1102 trans = btrfs_start_transaction(root, 0); 1215 trans = btrfs_start_transaction(root, 0);
1216 if (IS_ERR(trans)) {
1217 btrfs_free_path(path);
1218 return PTR_ERR(trans);
1219 }
1103 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1220 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1104 key.type = BTRFS_DEV_ITEM_KEY; 1221 key.type = BTRFS_DEV_ITEM_KEY;
1105 key.offset = device->devid; 1222 key.offset = device->devid;
@@ -1183,8 +1300,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1183 goto out; 1300 goto out;
1184 } 1301 }
1185 } else { 1302 } else {
1186 bdev = open_bdev_exclusive(device_path, FMODE_READ, 1303 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1187 root->fs_info->bdev_holder); 1304 root->fs_info->bdev_holder);
1188 if (IS_ERR(bdev)) { 1305 if (IS_ERR(bdev)) {
1189 ret = PTR_ERR(bdev); 1306 ret = PTR_ERR(bdev);
1190 goto out; 1307 goto out;
@@ -1193,7 +1310,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1193 set_blocksize(bdev, 4096); 1310 set_blocksize(bdev, 4096);
1194 bh = btrfs_read_dev_super(bdev); 1311 bh = btrfs_read_dev_super(bdev);
1195 if (!bh) { 1312 if (!bh) {
1196 ret = -EIO; 1313 ret = -EINVAL;
1197 goto error_close; 1314 goto error_close;
1198 } 1315 }
1199 disk_super = (struct btrfs_super_block *)bh->b_data; 1316 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1221,11 +1338,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1221 1338
1222 ret = btrfs_shrink_device(device, 0); 1339 ret = btrfs_shrink_device(device, 0);
1223 if (ret) 1340 if (ret)
1224 goto error_brelse; 1341 goto error_undo;
1225 1342
1226 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1343 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1227 if (ret) 1344 if (ret)
1228 goto error_brelse; 1345 goto error_undo;
1229 1346
1230 device->in_fs_metadata = 0; 1347 device->in_fs_metadata = 0;
1231 1348
@@ -1251,7 +1368,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1251 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1368 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1252 1369
1253 if (device->bdev) { 1370 if (device->bdev) {
1254 close_bdev_exclusive(device->bdev, device->mode); 1371 blkdev_put(device->bdev, device->mode);
1255 device->bdev = NULL; 1372 device->bdev = NULL;
1256 device->fs_devices->open_devices--; 1373 device->fs_devices->open_devices--;
1257 } 1374 }
@@ -1294,11 +1411,18 @@ error_brelse:
1294 brelse(bh); 1411 brelse(bh);
1295error_close: 1412error_close:
1296 if (bdev) 1413 if (bdev)
1297 close_bdev_exclusive(bdev, FMODE_READ); 1414 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1298out: 1415out:
1299 mutex_unlock(&root->fs_info->volume_mutex); 1416 mutex_unlock(&root->fs_info->volume_mutex);
1300 mutex_unlock(&uuid_mutex); 1417 mutex_unlock(&uuid_mutex);
1301 return ret; 1418 return ret;
1419error_undo:
1420 if (device->writeable) {
1421 list_add(&device->dev_alloc_list,
1422 &root->fs_info->fs_devices->alloc_list);
1423 root->fs_info->fs_devices->rw_devices++;
1424 }
1425 goto error_brelse;
1302} 1426}
1303 1427
1304/* 1428/*
@@ -1446,7 +1570,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1446 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1570 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1447 return -EINVAL; 1571 return -EINVAL;
1448 1572
1449 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); 1573 bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
1574 root->fs_info->bdev_holder);
1450 if (IS_ERR(bdev)) 1575 if (IS_ERR(bdev))
1451 return PTR_ERR(bdev); 1576 return PTR_ERR(bdev);
1452 1577
@@ -1487,11 +1612,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1487 1612
1488 ret = find_next_devid(root, &device->devid); 1613 ret = find_next_devid(root, &device->devid);
1489 if (ret) { 1614 if (ret) {
1615 kfree(device->name);
1490 kfree(device); 1616 kfree(device);
1491 goto error; 1617 goto error;
1492 } 1618 }
1493 1619
1494 trans = btrfs_start_transaction(root, 0); 1620 trans = btrfs_start_transaction(root, 0);
1621 if (IS_ERR(trans)) {
1622 kfree(device->name);
1623 kfree(device);
1624 ret = PTR_ERR(trans);
1625 goto error;
1626 }
1627
1495 lock_chunks(root); 1628 lock_chunks(root);
1496 1629
1497 device->writeable = 1; 1630 device->writeable = 1;
@@ -1507,7 +1640,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1507 device->dev_root = root->fs_info->dev_root; 1640 device->dev_root = root->fs_info->dev_root;
1508 device->bdev = bdev; 1641 device->bdev = bdev;
1509 device->in_fs_metadata = 1; 1642 device->in_fs_metadata = 1;
1510 device->mode = 0; 1643 device->mode = FMODE_EXCL;
1511 set_blocksize(device->bdev, 4096); 1644 set_blocksize(device->bdev, 4096);
1512 1645
1513 if (seeding_dev) { 1646 if (seeding_dev) {
@@ -1572,7 +1705,7 @@ out:
1572 mutex_unlock(&root->fs_info->volume_mutex); 1705 mutex_unlock(&root->fs_info->volume_mutex);
1573 return ret; 1706 return ret;
1574error: 1707error:
1575 close_bdev_exclusive(bdev, 0); 1708 blkdev_put(bdev, FMODE_EXCL);
1576 if (seeding_dev) { 1709 if (seeding_dev) {
1577 mutex_unlock(&uuid_mutex); 1710 mutex_unlock(&uuid_mutex);
1578 up_write(&sb->s_umount); 1711 up_write(&sb->s_umount);
@@ -1759,7 +1892,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1759 return ret; 1892 return ret;
1760 1893
1761 trans = btrfs_start_transaction(root, 0); 1894 trans = btrfs_start_transaction(root, 0);
1762 BUG_ON(!trans); 1895 BUG_ON(IS_ERR(trans));
1763 1896
1764 lock_chunks(root); 1897 lock_chunks(root);
1765 1898
@@ -1912,6 +2045,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
1912 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2045 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1913 return -EROFS; 2046 return -EROFS;
1914 2047
2048 if (!capable(CAP_SYS_ADMIN))
2049 return -EPERM;
2050
1915 mutex_lock(&dev_root->fs_info->volume_mutex); 2051 mutex_lock(&dev_root->fs_info->volume_mutex);
1916 dev_root = dev_root->fs_info->dev_root; 2052 dev_root = dev_root->fs_info->dev_root;
1917 2053
@@ -1930,7 +2066,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1930 BUG_ON(ret); 2066 BUG_ON(ret);
1931 2067
1932 trans = btrfs_start_transaction(dev_root, 0); 2068 trans = btrfs_start_transaction(dev_root, 0);
1933 BUG_ON(!trans); 2069 BUG_ON(IS_ERR(trans));
1934 2070
1935 ret = btrfs_grow_device(trans, device, old_size); 2071 ret = btrfs_grow_device(trans, device, old_size);
1936 BUG_ON(ret); 2072 BUG_ON(ret);
@@ -2096,6 +2232,11 @@ again:
2096 2232
2097 /* Shrinking succeeded, else we would be at "done". */ 2233 /* Shrinking succeeded, else we would be at "done". */
2098 trans = btrfs_start_transaction(root, 0); 2234 trans = btrfs_start_transaction(root, 0);
2235 if (IS_ERR(trans)) {
2236 ret = PTR_ERR(trans);
2237 goto done;
2238 }
2239
2099 lock_chunks(root); 2240 lock_chunks(root);
2100 2241
2101 device->disk_total_bytes = new_size; 2242 device->disk_total_bytes = new_size;
@@ -2150,66 +2291,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
2150 return calc_size * num_stripes; 2291 return calc_size * num_stripes;
2151} 2292}
2152 2293
2153static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2294/* Used to sort the devices by max_avail(descending sort) */
2154 struct btrfs_root *extent_root, 2295int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
2155 struct map_lookup **map_ret,
2156 u64 *num_bytes, u64 *stripe_size,
2157 u64 start, u64 type)
2158{ 2296{
2159 struct btrfs_fs_info *info = extent_root->fs_info; 2297 if (((struct btrfs_device_info *)dev_info1)->max_avail >
2160 struct btrfs_device *device = NULL; 2298 ((struct btrfs_device_info *)dev_info2)->max_avail)
2161 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2299 return -1;
2162 struct list_head *cur; 2300 else if (((struct btrfs_device_info *)dev_info1)->max_avail <
2163 struct map_lookup *map = NULL; 2301 ((struct btrfs_device_info *)dev_info2)->max_avail)
2164 struct extent_map_tree *em_tree; 2302 return 1;
2165 struct extent_map *em; 2303 else
2166 struct list_head private_devs; 2304 return 0;
2167 int min_stripe_size = 1 * 1024 * 1024; 2305}
2168 u64 calc_size = 1024 * 1024 * 1024;
2169 u64 max_chunk_size = calc_size;
2170 u64 min_free;
2171 u64 avail;
2172 u64 max_avail = 0;
2173 u64 dev_offset;
2174 int num_stripes = 1;
2175 int min_stripes = 1;
2176 int sub_stripes = 0;
2177 int looped = 0;
2178 int ret;
2179 int index;
2180 int stripe_len = 64 * 1024;
2181 2306
2182 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2307static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
2183 (type & BTRFS_BLOCK_GROUP_DUP)) { 2308 int *num_stripes, int *min_stripes,
2184 WARN_ON(1); 2309 int *sub_stripes)
2185 type &= ~BTRFS_BLOCK_GROUP_DUP; 2310{
2186 } 2311 *num_stripes = 1;
2187 if (list_empty(&fs_devices->alloc_list)) 2312 *min_stripes = 1;
2188 return -ENOSPC; 2313 *sub_stripes = 0;
2189 2314
2190 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2315 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2191 num_stripes = fs_devices->rw_devices; 2316 *num_stripes = fs_devices->rw_devices;
2192 min_stripes = 2; 2317 *min_stripes = 2;
2193 } 2318 }
2194 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2319 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2195 num_stripes = 2; 2320 *num_stripes = 2;
2196 min_stripes = 2; 2321 *min_stripes = 2;
2197 } 2322 }
2198 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2323 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2199 if (fs_devices->rw_devices < 2) 2324 if (fs_devices->rw_devices < 2)
2200 return -ENOSPC; 2325 return -ENOSPC;
2201 num_stripes = 2; 2326 *num_stripes = 2;
2202 min_stripes = 2; 2327 *min_stripes = 2;
2203 } 2328 }
2204 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2329 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2205 num_stripes = fs_devices->rw_devices; 2330 *num_stripes = fs_devices->rw_devices;
2206 if (num_stripes < 4) 2331 if (*num_stripes < 4)
2207 return -ENOSPC; 2332 return -ENOSPC;
2208 num_stripes &= ~(u32)1; 2333 *num_stripes &= ~(u32)1;
2209 sub_stripes = 2; 2334 *sub_stripes = 2;
2210 min_stripes = 4; 2335 *min_stripes = 4;
2211 } 2336 }
2212 2337
2338 return 0;
2339}
2340
2341static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
2342 u64 proposed_size, u64 type,
2343 int num_stripes, int small_stripe)
2344{
2345 int min_stripe_size = 1 * 1024 * 1024;
2346 u64 calc_size = proposed_size;
2347 u64 max_chunk_size = calc_size;
2348 int ncopies = 1;
2349
2350 if (type & (BTRFS_BLOCK_GROUP_RAID1 |
2351 BTRFS_BLOCK_GROUP_DUP |
2352 BTRFS_BLOCK_GROUP_RAID10))
2353 ncopies = 2;
2354
2213 if (type & BTRFS_BLOCK_GROUP_DATA) { 2355 if (type & BTRFS_BLOCK_GROUP_DATA) {
2214 max_chunk_size = 10 * calc_size; 2356 max_chunk_size = 10 * calc_size;
2215 min_stripe_size = 64 * 1024 * 1024; 2357 min_stripe_size = 64 * 1024 * 1024;
@@ -2226,51 +2368,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2226 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2368 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2227 max_chunk_size); 2369 max_chunk_size);
2228 2370
2229again: 2371 if (calc_size * num_stripes > max_chunk_size * ncopies) {
2230 max_avail = 0; 2372 calc_size = max_chunk_size * ncopies;
2231 if (!map || map->num_stripes != num_stripes) {
2232 kfree(map);
2233 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2234 if (!map)
2235 return -ENOMEM;
2236 map->num_stripes = num_stripes;
2237 }
2238
2239 if (calc_size * num_stripes > max_chunk_size) {
2240 calc_size = max_chunk_size;
2241 do_div(calc_size, num_stripes); 2373 do_div(calc_size, num_stripes);
2242 do_div(calc_size, stripe_len); 2374 do_div(calc_size, BTRFS_STRIPE_LEN);
2243 calc_size *= stripe_len; 2375 calc_size *= BTRFS_STRIPE_LEN;
2244 } 2376 }
2245 2377
2246 /* we don't want tiny stripes */ 2378 /* we don't want tiny stripes */
2247 if (!looped) 2379 if (!small_stripe)
2248 calc_size = max_t(u64, min_stripe_size, calc_size); 2380 calc_size = max_t(u64, min_stripe_size, calc_size);
2249 2381
2250 /* 2382 /*
2251 * we're about to do_div by the stripe_len so lets make sure 2383 * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
2252 * we end up with something bigger than a stripe 2384 * we end up with something bigger than a stripe
2253 */ 2385 */
2254 calc_size = max_t(u64, calc_size, stripe_len * 4); 2386 calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
2387
2388 do_div(calc_size, BTRFS_STRIPE_LEN);
2389 calc_size *= BTRFS_STRIPE_LEN;
2390
2391 return calc_size;
2392}
2393
2394static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
2395 int num_stripes)
2396{
2397 struct map_lookup *new;
2398 size_t len = map_lookup_size(num_stripes);
2399
2400 BUG_ON(map->num_stripes < num_stripes);
2401
2402 if (map->num_stripes == num_stripes)
2403 return map;
2404
2405 new = kmalloc(len, GFP_NOFS);
2406 if (!new) {
2407 /* just change map->num_stripes */
2408 map->num_stripes = num_stripes;
2409 return map;
2410 }
2411
2412 memcpy(new, map, len);
2413 new->num_stripes = num_stripes;
2414 kfree(map);
2415 return new;
2416}
2417
2418/*
2419 * helper to allocate device space from btrfs_device_info, in which we stored
2420 * max free space information of every device. It is used when we can not
2421 * allocate chunks by default size.
2422 *
2423 * With this helper, we can allocate a new chunk that is as large as possible.
2424 */
2425static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
2426 struct btrfs_fs_devices *fs_devices,
2427 struct btrfs_device_info *devices,
2428 int nr_device, u64 type,
2429 struct map_lookup **map_lookup,
2430 int min_stripes, u64 *stripe_size)
2431{
2432 int i, index, sort_again = 0;
2433 int min_devices = min_stripes;
2434 u64 max_avail, min_free;
2435 struct map_lookup *map = *map_lookup;
2436 int ret;
2437
2438 if (nr_device < min_stripes)
2439 return -ENOSPC;
2440
2441 btrfs_descending_sort_devices(devices, nr_device);
2442
2443 max_avail = devices[0].max_avail;
2444 if (!max_avail)
2445 return -ENOSPC;
2446
2447 for (i = 0; i < nr_device; i++) {
2448 /*
2449 * if dev_offset = 0, it means the free space of this device
2450 * is less than what we need, and we didn't search max avail
2451 * extent on this device, so do it now.
2452 */
2453 if (!devices[i].dev_offset) {
2454 ret = find_free_dev_extent(trans, devices[i].dev,
2455 max_avail,
2456 &devices[i].dev_offset,
2457 &devices[i].max_avail);
2458 if (ret != 0 && ret != -ENOSPC)
2459 return ret;
2460 sort_again = 1;
2461 }
2462 }
2463
2465 /* we updated the max avail free extent of each device, so sort again */
2465 if (sort_again)
2466 btrfs_descending_sort_devices(devices, nr_device);
2467
2468 if (type & BTRFS_BLOCK_GROUP_DUP)
2469 min_devices = 1;
2470
2471 if (!devices[min_devices - 1].max_avail)
2472 return -ENOSPC;
2473
2474 max_avail = devices[min_devices - 1].max_avail;
2475 if (type & BTRFS_BLOCK_GROUP_DUP)
2476 do_div(max_avail, 2);
2477
2478 max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
2479 min_stripes, 1);
2480 if (type & BTRFS_BLOCK_GROUP_DUP)
2481 min_free = max_avail * 2;
2482 else
2483 min_free = max_avail;
2484
2485 if (min_free > devices[min_devices - 1].max_avail)
2486 return -ENOSPC;
2487
2488 map = __shrink_map_lookup_stripes(map, min_stripes);
2489 *stripe_size = max_avail;
2490
2491 index = 0;
2492 for (i = 0; i < min_stripes; i++) {
2493 map->stripes[i].dev = devices[index].dev;
2494 map->stripes[i].physical = devices[index].dev_offset;
2495 if (type & BTRFS_BLOCK_GROUP_DUP) {
2496 i++;
2497 map->stripes[i].dev = devices[index].dev;
2498 map->stripes[i].physical = devices[index].dev_offset +
2499 max_avail;
2500 }
2501 index++;
2502 }
2503 *map_lookup = map;
2504
2505 return 0;
2506}
2255 2507
2256 do_div(calc_size, stripe_len); 2508static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2257 calc_size *= stripe_len; 2509 struct btrfs_root *extent_root,
2510 struct map_lookup **map_ret,
2511 u64 *num_bytes, u64 *stripe_size,
2512 u64 start, u64 type)
2513{
2514 struct btrfs_fs_info *info = extent_root->fs_info;
2515 struct btrfs_device *device = NULL;
2516 struct btrfs_fs_devices *fs_devices = info->fs_devices;
2517 struct list_head *cur;
2518 struct map_lookup *map;
2519 struct extent_map_tree *em_tree;
2520 struct extent_map *em;
2521 struct btrfs_device_info *devices_info;
2522 struct list_head private_devs;
2523 u64 calc_size = 1024 * 1024 * 1024;
2524 u64 min_free;
2525 u64 avail;
2526 u64 dev_offset;
2527 int num_stripes;
2528 int min_stripes;
2529 int sub_stripes;
2530 int min_devices; /* the min number of devices we need */
2531 int i;
2532 int ret;
2533 int index;
2534
2535 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2536 (type & BTRFS_BLOCK_GROUP_DUP)) {
2537 WARN_ON(1);
2538 type &= ~BTRFS_BLOCK_GROUP_DUP;
2539 }
2540 if (list_empty(&fs_devices->alloc_list))
2541 return -ENOSPC;
2542
2543 ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
2544 &min_stripes, &sub_stripes);
2545 if (ret)
2546 return ret;
2547
2548 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2549 GFP_NOFS);
2550 if (!devices_info)
2551 return -ENOMEM;
2552
2553 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2554 if (!map) {
2555 ret = -ENOMEM;
2556 goto error;
2557 }
2558 map->num_stripes = num_stripes;
2258 2559
2259 cur = fs_devices->alloc_list.next; 2560 cur = fs_devices->alloc_list.next;
2260 index = 0; 2561 index = 0;
2562 i = 0;
2261 2563
2262 if (type & BTRFS_BLOCK_GROUP_DUP) 2564 calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
2565 num_stripes, 0);
2566
2567 if (type & BTRFS_BLOCK_GROUP_DUP) {
2263 min_free = calc_size * 2; 2568 min_free = calc_size * 2;
2264 else 2569 min_devices = 1;
2570 } else {
2265 min_free = calc_size; 2571 min_free = calc_size;
2266 2572 min_devices = min_stripes;
2267 /* 2573 }
2268 * we add 1MB because we never use the first 1MB of the device, unless
2269 * we've looped, then we are likely allocating the maximum amount of
2270 * space left already
2271 */
2272 if (!looped)
2273 min_free += 1024 * 1024;
2274 2574
2275 INIT_LIST_HEAD(&private_devs); 2575 INIT_LIST_HEAD(&private_devs);
2276 while (index < num_stripes) { 2576 while (index < num_stripes) {
@@ -2283,27 +2583,39 @@ again:
2283 cur = cur->next; 2583 cur = cur->next;
2284 2584
2285 if (device->in_fs_metadata && avail >= min_free) { 2585 if (device->in_fs_metadata && avail >= min_free) {
2286 ret = find_free_dev_extent(trans, device, 2586 ret = find_free_dev_extent(trans, device, min_free,
2287 min_free, &dev_offset, 2587 &devices_info[i].dev_offset,
2288 &max_avail); 2588 &devices_info[i].max_avail);
2289 if (ret == 0) { 2589 if (ret == 0) {
2290 list_move_tail(&device->dev_alloc_list, 2590 list_move_tail(&device->dev_alloc_list,
2291 &private_devs); 2591 &private_devs);
2292 map->stripes[index].dev = device; 2592 map->stripes[index].dev = device;
2293 map->stripes[index].physical = dev_offset; 2593 map->stripes[index].physical =
2594 devices_info[i].dev_offset;
2294 index++; 2595 index++;
2295 if (type & BTRFS_BLOCK_GROUP_DUP) { 2596 if (type & BTRFS_BLOCK_GROUP_DUP) {
2296 map->stripes[index].dev = device; 2597 map->stripes[index].dev = device;
2297 map->stripes[index].physical = 2598 map->stripes[index].physical =
2298 dev_offset + calc_size; 2599 devices_info[i].dev_offset +
2600 calc_size;
2299 index++; 2601 index++;
2300 } 2602 }
2301 } 2603 } else if (ret != -ENOSPC)
2302 } else if (device->in_fs_metadata && avail > max_avail) 2604 goto error;
2303 max_avail = avail; 2605
2606 devices_info[i].dev = device;
2607 i++;
2608 } else if (device->in_fs_metadata &&
2609 avail >= BTRFS_STRIPE_LEN) {
2610 devices_info[i].dev = device;
2611 devices_info[i].max_avail = avail;
2612 i++;
2613 }
2614
2304 if (cur == &fs_devices->alloc_list) 2615 if (cur == &fs_devices->alloc_list)
2305 break; 2616 break;
2306 } 2617 }
2618
2307 list_splice(&private_devs, &fs_devices->alloc_list); 2619 list_splice(&private_devs, &fs_devices->alloc_list);
2308 if (index < num_stripes) { 2620 if (index < num_stripes) {
2309 if (index >= min_stripes) { 2621 if (index >= min_stripes) {
@@ -2312,34 +2624,36 @@ again:
2312 num_stripes /= sub_stripes; 2624 num_stripes /= sub_stripes;
2313 num_stripes *= sub_stripes; 2625 num_stripes *= sub_stripes;
2314 } 2626 }
2315 looped = 1; 2627
2316 goto again; 2628 map = __shrink_map_lookup_stripes(map, num_stripes);
2317 } 2629 } else if (i >= min_devices) {
2318 if (!looped && max_avail > 0) { 2630 ret = __btrfs_alloc_tiny_space(trans, fs_devices,
2319 looped = 1; 2631 devices_info, i, type,
2320 calc_size = max_avail; 2632 &map, min_stripes,
2321 goto again; 2633 &calc_size);
2634 if (ret)
2635 goto error;
2636 } else {
2637 ret = -ENOSPC;
2638 goto error;
2322 } 2639 }
2323 kfree(map);
2324 return -ENOSPC;
2325 } 2640 }
2326 map->sector_size = extent_root->sectorsize; 2641 map->sector_size = extent_root->sectorsize;
2327 map->stripe_len = stripe_len; 2642 map->stripe_len = BTRFS_STRIPE_LEN;
2328 map->io_align = stripe_len; 2643 map->io_align = BTRFS_STRIPE_LEN;
2329 map->io_width = stripe_len; 2644 map->io_width = BTRFS_STRIPE_LEN;
2330 map->type = type; 2645 map->type = type;
2331 map->num_stripes = num_stripes;
2332 map->sub_stripes = sub_stripes; 2646 map->sub_stripes = sub_stripes;
2333 2647
2334 *map_ret = map; 2648 *map_ret = map;
2335 *stripe_size = calc_size; 2649 *stripe_size = calc_size;
2336 *num_bytes = chunk_bytes_by_type(type, calc_size, 2650 *num_bytes = chunk_bytes_by_type(type, calc_size,
2337 num_stripes, sub_stripes); 2651 map->num_stripes, sub_stripes);
2338 2652
2339 em = alloc_extent_map(GFP_NOFS); 2653 em = alloc_extent_map(GFP_NOFS);
2340 if (!em) { 2654 if (!em) {
2341 kfree(map); 2655 ret = -ENOMEM;
2342 return -ENOMEM; 2656 goto error;
2343 } 2657 }
2344 em->bdev = (struct block_device *)map; 2658 em->bdev = (struct block_device *)map;
2345 em->start = start; 2659 em->start = start;
@@ -2372,7 +2686,13 @@ again:
2372 index++; 2686 index++;
2373 } 2687 }
2374 2688
2689 kfree(devices_info);
2375 return 0; 2690 return 0;
2691
2692error:
2693 kfree(map);
2694 kfree(devices_info);
2695 return ret;
2376} 2696}
2377 2697
2378static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2698static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,