Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r--	fs/btrfs/volumes.c	1094
1 files changed, 677 insertions(+), 417 deletions(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd318ff280b2..19450bc53632 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -22,6 +22,7 @@
 #include <linux/blkdev.h>
 #include <linux/random.h>
 #include <linux/iocontext.h>
+#include <linux/capability.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
@@ -32,38 +33,14 @@
 #include "volumes.h"
 #include "async-thread.h"
 
-struct map_lookup {
-	u64 type;
-	int io_align;
-	int io_width;
-	int stripe_len;
-	int sector_size;
-	int num_stripes;
-	int sub_stripes;
-	struct btrfs_bio_stripe stripes[];
-};
-
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
 
-#define map_lookup_size(n) (sizeof(struct map_lookup) + \
-			    (sizeof(struct btrfs_bio_stripe) * (n)))
-
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
 
-void btrfs_lock_volumes(void)
-{
-	mutex_lock(&uuid_mutex);
-}
-
-void btrfs_unlock_volumes(void)
-{
-	mutex_unlock(&uuid_mutex);
-}
-
 static void lock_chunks(struct btrfs_root *root)
 {
 	mutex_lock(&root->fs_info->chunk_mutex);
@@ -161,22 +138,25 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	struct bio *cur;
 	int again = 0;
 	unsigned long num_run;
-	unsigned long num_sync_run;
 	unsigned long batch_run = 0;
 	unsigned long limit;
 	unsigned long last_waited = 0;
 	int force_reg = 0;
+	struct blk_plug plug;
+
+	/*
+	 * this function runs all the bios we've collected for
+	 * a particular device. We don't want to wander off to
+	 * another device without first sending all of these down.
+	 * So, setup a plug here and finish it off before we return
+	 */
+	blk_start_plug(&plug);
 
 	bdi = blk_get_backing_dev_info(device->bdev);
 	fs_info = device->dev_root->fs_info;
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
 
-	/* we want to make sure that every time we switch from the sync
-	 * list to the normal list, we unplug
-	 */
-	num_sync_run = 0;
-
 loop:
 	spin_lock(&device->io_lock);
 
@@ -222,15 +202,6 @@ loop_lock:
 
 	spin_unlock(&device->io_lock);
 
-	/*
-	 * if we're doing the regular priority list, make sure we unplug
-	 * for any high prio bios we've sent down
-	 */
-	if (pending_bios == &device->pending_bios && num_sync_run > 0) {
-		num_sync_run = 0;
-		blk_run_backing_dev(bdi, NULL);
-	}
-
 	while (pending) {
 
 		rmb();
@@ -258,19 +229,11 @@ loop_lock:
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 
-		if (cur->bi_rw & REQ_SYNC)
-			num_sync_run++;
-
 		submit_bio(cur->bi_rw, cur);
 		num_run++;
 		batch_run++;
-		if (need_resched()) {
-			if (num_sync_run) {
-				blk_run_backing_dev(bdi, NULL);
-				num_sync_run = 0;
-			}
+		if (need_resched())
 			cond_resched();
-		}
 
 		/*
 		 * we made progress, there is more work to do and the bdi
@@ -303,13 +266,8 @@ loop_lock:
 			 * against it before looping
 			 */
 			last_waited = ioc->last_waited;
-			if (need_resched()) {
-				if (num_sync_run) {
-					blk_run_backing_dev(bdi, NULL);
-					num_sync_run = 0;
-				}
+			if (need_resched())
 				cond_resched();
-			}
 			continue;
 		}
 		spin_lock(&device->io_lock);
@@ -322,22 +280,6 @@ loop_lock:
 		}
 	}
 
-	if (num_sync_run) {
-		num_sync_run = 0;
-		blk_run_backing_dev(bdi, NULL);
-	}
-	/*
-	 * IO has already been through a long path to get here. Checksumming,
-	 * async helper threads, perhaps compression. We've done a pretty
-	 * good job of collecting a batch of IO and should just unplug
-	 * the device right away.
-	 *
-	 * This will help anyone who is waiting on the IO, they might have
-	 * already unplugged, but managed to do so before the bio they
-	 * cared about found its way down here.
-	 */
-	blk_run_backing_dev(bdi, NULL);
-
 	cond_resched();
 	if (again)
 		goto loop;
@@ -348,6 +290,7 @@ loop_lock:
 	spin_unlock(&device->io_lock);
 
 done:
+	blk_finish_plug(&plug);
 	return 0;
 }
 
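The hunks above drop the old manual unplugging scheme (counting REQ_SYNC bios and poking the backing device with blk_run_backing_dev()) in favour of the on-stack block plugging API added in 2.6.39. A minimal sketch of the pattern, not taken from this patch (next_bio() is a hypothetical source of queued bios):

	static void submit_batch(void)
	{
		struct blk_plug plug;
		struct bio *bio;

		/* collect everything submitted below in a per-task plug */
		blk_start_plug(&plug);
		while ((bio = next_bio()) != NULL)
			submit_bio(bio->bi_rw, bio);
		/* hand the whole batch to the block layer in one go */
		blk_finish_plug(&plug);
	}

Because the plug lives on the stack and is only flushed by blk_finish_plug(), the exit path of run_scheduled_bios() goes through the done: label, which the hunk above extends accordingly.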
@@ -398,7 +341,6 @@ static noinline int device_list_add(const char *path,
 		device->work.func = pending_bios_fn;
 		memcpy(device->uuid, disk_super->dev_item.uuid,
 		       BTRFS_UUID_SIZE);
-		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 		device->name = kstrdup(path, GFP_NOFS);
 		if (!device->name) {
@@ -408,17 +350,21 @@ static noinline int device_list_add(const char *path,
 		INIT_LIST_HEAD(&device->dev_alloc_list);
 
 		mutex_lock(&fs_devices->device_list_mutex);
-		list_add(&device->dev_list, &fs_devices->devices);
+		list_add_rcu(&device->dev_list, &fs_devices->devices);
 		mutex_unlock(&fs_devices->device_list_mutex);
 
 		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
-	} else if (strcmp(device->name, path)) {
+	} else if (!device->name || strcmp(device->name, path)) {
 		name = kstrdup(path, GFP_NOFS);
 		if (!name)
 			return -ENOMEM;
 		kfree(device->name);
 		device->name = name;
+		if (device->missing) {
+			fs_devices->missing_devices--;
+			device->missing = 0;
+		}
 	}
 
 	if (found_transid > fs_devices->latest_trans) {
@@ -447,7 +393,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 	fs_devices->latest_trans = orig->latest_trans;
 	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
 
-	mutex_lock(&orig->device_list_mutex);
+	/* We have held the volume lock, it is safe to get the devices. */
 	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
 		device = kzalloc(sizeof(*device), GFP_NOFS);
 		if (!device)
@@ -462,7 +408,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 		device->devid = orig_dev->devid;
 		device->work.func = pending_bios_fn;
 		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
-		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 		INIT_LIST_HEAD(&device->dev_list);
 		INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -471,10 +416,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
 	}
-	mutex_unlock(&orig->device_list_mutex);
 	return fs_devices;
 error:
-	mutex_unlock(&orig->device_list_mutex);
 	free_fs_devices(fs_devices);
 	return ERR_PTR(-ENOMEM);
 }
@@ -485,13 +428,13 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 
 	mutex_lock(&uuid_mutex);
 again:
-	mutex_lock(&fs_devices->device_list_mutex);
+	/* This is the initialized path, it is safe to release the devices. */
 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 		if (device->in_fs_metadata)
 			continue;
 
 		if (device->bdev) {
-			close_bdev_exclusive(device->bdev, device->mode);
+			blkdev_put(device->bdev, device->mode);
 			device->bdev = NULL;
 			fs_devices->open_devices--;
 		}
@@ -505,7 +448,6 @@ again:
 		kfree(device->name);
 		kfree(device);
 	}
-	mutex_unlock(&fs_devices->device_list_mutex);
 
 	if (fs_devices->seed) {
 		fs_devices = fs_devices->seed;
@@ -516,6 +458,29 @@ again:
 	return 0;
 }
 
+static void __free_device(struct work_struct *work)
+{
+	struct btrfs_device *device;
+
+	device = container_of(work, struct btrfs_device, rcu_work);
+
+	if (device->bdev)
+		blkdev_put(device->bdev, device->mode);
+
+	kfree(device->name);
+	kfree(device);
+}
+
+static void free_device(struct rcu_head *head)
+{
+	struct btrfs_device *device;
+
+	device = container_of(head, struct btrfs_device, rcu);
+
+	INIT_WORK(&device->rcu_work, __free_device);
+	schedule_work(&device->rcu_work);
+}
+
 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct btrfs_device *device;
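free_device() above runs as an RCU callback, i.e. from softirq context after a grace period, where blkdev_put() (which can sleep) is not allowed; that is why it only bounces the real teardown to a workqueue. A hedged sketch of this generic two-stage pattern, with a made-up struct obj:

	struct obj {
		struct rcu_head rcu;
		struct work_struct rcu_work;
	};

	static void obj_free_work(struct work_struct *work)
	{
		struct obj *o = container_of(work, struct obj, rcu_work);

		/* process context: sleeping operations are safe here */
		kfree(o);
	}

	static void obj_free_rcu(struct rcu_head *head)
	{
		struct obj *o = container_of(head, struct obj, rcu);

		/* softirq context: defer the real work */
		INIT_WORK(&o->rcu_work, obj_free_work);
		schedule_work(&o->rcu_work);
	}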
@@ -523,20 +488,32 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 	if (--fs_devices->opened > 0)
 		return 0;
 
+	mutex_lock(&fs_devices->device_list_mutex);
 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
-		if (device->bdev) {
-			close_bdev_exclusive(device->bdev, device->mode);
+		struct btrfs_device *new_device;
+
+		if (device->bdev)
 			fs_devices->open_devices--;
-		}
+
 		if (device->writeable) {
 			list_del_init(&device->dev_alloc_list);
 			fs_devices->rw_devices--;
 		}
 
-		device->bdev = NULL;
-		device->writeable = 0;
-		device->in_fs_metadata = 0;
+		new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
+		BUG_ON(!new_device);
+		memcpy(new_device, device, sizeof(*new_device));
+		new_device->name = kstrdup(device->name, GFP_NOFS);
+		BUG_ON(device->name && !new_device->name);
+		new_device->bdev = NULL;
+		new_device->writeable = 0;
+		new_device->in_fs_metadata = 0;
+		list_replace_rcu(&device->dev_list, &new_device->dev_list);
+
+		call_rcu(&device->rcu, free_device);
 	}
+	mutex_unlock(&fs_devices->device_list_mutex);
+
 	WARN_ON(fs_devices->open_devices);
 	WARN_ON(fs_devices->rw_devices);
 	fs_devices->opened = 0;
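Since the device list is now maintained with list_add_rcu()/list_del_rcu()/list_replace_rcu(), readers that cannot take device_list_mutex can traverse it under rcu_read_lock() instead. A hypothetical reader, assuming only the fields visible in this patch:

	#include <linux/rculist.h>

	static int count_open_devices(struct btrfs_fs_devices *fs_devices)
	{
		struct btrfs_device *device;
		int nr = 0;

		rcu_read_lock();
		list_for_each_entry_rcu(device, &fs_devices->devices, dev_list)
			if (device->bdev)
				nr++;
		rcu_read_unlock();
		return nr;
	}

This is also why __btrfs_close_devices() above clones each device into new_device and swaps it in with list_replace_rcu() rather than clearing fields in place: concurrent readers may still be looking at the old entry until the grace period ends.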
@@ -582,13 +559,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	int seeding = 1;
 	int ret = 0;
 
+	flags |= FMODE_EXCL;
+
 	list_for_each_entry(device, head, dev_list) {
 		if (device->bdev)
 			continue;
 		if (!device->name)
 			continue;
 
-		bdev = open_bdev_exclusive(device->name, flags, holder);
+		bdev = blkdev_get_by_path(device->name, flags, holder);
 		if (IS_ERR(bdev)) {
 			printk(KERN_INFO "open %s failed\n", device->name);
 			goto error;
@@ -596,8 +575,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		set_blocksize(bdev, 4096);
 
 		bh = btrfs_read_dev_super(bdev);
-		if (!bh)
+		if (!bh) {
+			ret = -EINVAL;
 			goto error_close;
+		}
 
 		disk_super = (struct btrfs_super_block *)bh->b_data;
 		devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -635,12 +616,13 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			list_add(&device->dev_alloc_list,
 				 &fs_devices->alloc_list);
 		}
+		brelse(bh);
 		continue;
 
 error_brelse:
 		brelse(bh);
 error_close:
-		close_bdev_exclusive(bdev, FMODE_READ);
+		blkdev_put(bdev, flags);
 error:
 		continue;
 	}
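open_bdev_exclusive()/close_bdev_exclusive() were removed from the block layer in 2.6.38; their replacement is blkdev_get_by_path()/blkdev_put(), where exclusivity is requested explicitly with FMODE_EXCL and the mode passed to blkdev_put() must match the one used at open time — hence this patch now stores FMODE_EXCL in device->mode. A sketch of the pairing (open_device()/close_device() are hypothetical helpers):

	#include <linux/blkdev.h>

	static struct block_device *open_device(const char *path, void *holder)
	{
		/* holder identifies the owner of the exclusive claim */
		return blkdev_get_by_path(path, FMODE_READ | FMODE_EXCL, holder);
	}

	static void close_device(struct block_device *bdev)
	{
		/* must pass the same mode flags used when opening */
		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
	}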
@@ -686,7 +668,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 
 	mutex_lock(&uuid_mutex);
 
-	bdev = open_bdev_exclusive(path, flags, holder);
+	flags |= FMODE_EXCL;
+	bdev = blkdev_get_by_path(path, flags, holder);
 
 	if (IS_ERR(bdev)) {
 		ret = PTR_ERR(bdev);
@@ -698,7 +681,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 		goto error_close;
 	bh = btrfs_read_dev_super(bdev);
 	if (!bh) {
-		ret = -EIO;
+		ret = -EINVAL;
 		goto error_close;
 	}
 	disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -706,77 +689,178 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	transid = btrfs_super_generation(disk_super);
 	if (disk_super->label[0])
 		printk(KERN_INFO "device label %s ", disk_super->label);
-	else {
-		/* FIXME, make a readl uuid parser */
-		printk(KERN_INFO "device fsid %llx-%llx ",
-		       *(unsigned long long *)disk_super->fsid,
-		       *(unsigned long long *)(disk_super->fsid + 8));
-	}
+	else
+		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
 	printk(KERN_CONT "devid %llu transid %llu %s\n",
 	       (unsigned long long)devid, (unsigned long long)transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
 	brelse(bh);
 error_close:
-	close_bdev_exclusive(bdev, flags);
+	blkdev_put(bdev, flags);
 error:
 	mutex_unlock(&uuid_mutex);
 	return ret;
 }
 
+/* helper to account the used device space in the range */
+int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+				   u64 end, u64 *length)
+{
+	struct btrfs_key key;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_dev_extent *dev_extent;
+	struct btrfs_path *path;
+	u64 extent_end;
+	int ret;
+	int slot;
+	struct extent_buffer *l;
+
+	*length = 0;
+
+	if (start >= device->total_bytes)
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = 2;
+
+	key.objectid = device->devid;
+	key.offset = start;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = btrfs_previous_item(root, path, key.objectid, key.type);
+		if (ret < 0)
+			goto out;
+	}
+
+	while (1) {
+		l = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(l)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto out;
+
+			break;
+		}
+		btrfs_item_key_to_cpu(l, &key, slot);
+
+		if (key.objectid < device->devid)
+			goto next;
+
+		if (key.objectid > device->devid)
+			break;
+
+		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+			goto next;
+
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		extent_end = key.offset + btrfs_dev_extent_length(l,
+								  dev_extent);
+		if (key.offset <= start && extent_end > end) {
+			*length = end - start + 1;
+			break;
+		} else if (key.offset <= start && extent_end > start)
+			*length += extent_end - start;
+		else if (key.offset > start && extent_end <= end)
+			*length += extent_end - key.offset;
+		else if (key.offset > start && key.offset <= end) {
+			*length += end - key.offset + 1;
+			break;
+		} else if (key.offset > end)
+			break;
+
+next:
+		path->slots[0]++;
+	}
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 /*
+ * find_free_dev_extent - find free space in the specified device
+ * @trans:	transaction handler
+ * @device:	the device which we search the free space in
+ * @num_bytes:	the size of the free space that we need
+ * @start:	store the start of the free space.
+ * @len:	the size of the free space. that we find, or the size of the max
+ * 		free space if we don't find suitable free space
+ *
  * this uses a pretty simple search, the expectation is that it is
  * called very infrequently and that a given device has a small number
  * of extents
+ *
+ * @start is used to store the start of the free space if we find. But if we
+ * don't find suitable free space, it will be used to store the start position
+ * of the max free space.
+ *
+ * @len is used to store the size of the free space that we find.
+ * But if we don't find suitable free space, it is used to store the size of
+ * the max free space.
  */
 int find_free_dev_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_device *device, u64 num_bytes,
-			 u64 *start, u64 *max_avail)
+			 u64 *start, u64 *len)
 {
 	struct btrfs_key key;
 	struct btrfs_root *root = device->dev_root;
-	struct btrfs_dev_extent *dev_extent = NULL;
+	struct btrfs_dev_extent *dev_extent;
 	struct btrfs_path *path;
-	u64 hole_size = 0;
-	u64 last_byte = 0;
-	u64 search_start = 0;
+	u64 hole_size;
+	u64 max_hole_start;
+	u64 max_hole_size;
+	u64 extent_end;
+	u64 search_start;
 	u64 search_end = device->total_bytes;
 	int ret;
-	int slot = 0;
-	int start_found;
+	int slot;
 	struct extent_buffer *l;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	path->reada = 2;
-	start_found = 0;
-
 	/* FIXME use last free of some kind */
 
 	/* we don't want to overwrite the superblock on the drive,
 	 * so we make sure to start at an offset of at least 1MB
 	 */
-	search_start = max((u64)1024 * 1024, search_start);
+	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+
+	max_hole_start = search_start;
+	max_hole_size = 0;
 
-	if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
-		search_start = max(root->fs_info->alloc_start, search_start);
+	if (search_start >= search_end) {
+		ret = -ENOSPC;
+		goto error;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto error;
+	}
+	path->reada = 2;
 
 	key.objectid = device->devid;
 	key.offset = search_start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
+
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
 	if (ret < 0)
-		goto error;
+		goto out;
 	if (ret > 0) {
 		ret = btrfs_previous_item(root, path, key.objectid, key.type);
 		if (ret < 0)
-			goto error;
-		if (ret > 0)
-			start_found = 1;
+			goto out;
 	}
-	l = path->nodes[0];
-	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+
 	while (1) {
 		l = path->nodes[0];
 		slot = path->slots[0];
@@ -785,24 +869,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
 			if (ret == 0)
 				continue;
 			if (ret < 0)
-				goto error;
-no_more_items:
-			if (!start_found) {
-				if (search_start >= search_end) {
-					ret = -ENOSPC;
-					goto error;
-				}
-				*start = search_start;
-				start_found = 1;
-				goto check_pending;
-			}
-			*start = last_byte > search_start ?
-				last_byte : search_start;
-			if (search_end <= *start) {
-				ret = -ENOSPC;
-				goto error;
-			}
-			goto check_pending;
+				goto out;
+
+			break;
 		}
 		btrfs_item_key_to_cpu(l, &key, slot);
 
@@ -810,48 +879,62 @@ no_more_items:
 			goto next;
 
 		if (key.objectid > device->devid)
-			goto no_more_items;
+			break;
 
-		if (key.offset >= search_start && key.offset > last_byte &&
-		    start_found) {
-			if (last_byte < search_start)
-				last_byte = search_start;
-			hole_size = key.offset - last_byte;
+		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+			goto next;
 
-			if (hole_size > *max_avail)
-				*max_avail = hole_size;
+		if (key.offset > search_start) {
+			hole_size = key.offset - search_start;
+
+			if (hole_size > max_hole_size) {
+				max_hole_start = search_start;
+				max_hole_size = hole_size;
+			}
 
-			if (key.offset > last_byte &&
-			    hole_size >= num_bytes) {
-				*start = last_byte;
-				goto check_pending;
+			/*
+			 * If this free space is greater than which we need,
+			 * it must be the max free space that we have found
+			 * until now, so max_hole_start must point to the start
+			 * of this free space and the length of this free space
+			 * is stored in max_hole_size. Thus, we return
+			 * max_hole_start and max_hole_size and go back to the
+			 * caller.
+			 */
+			if (hole_size >= num_bytes) {
+				ret = 0;
+				goto out;
 			}
 		}
-		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
-			goto next;
 
-		start_found = 1;
 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
-		last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+		extent_end = key.offset + btrfs_dev_extent_length(l,
+								  dev_extent);
+		if (extent_end > search_start)
+			search_start = extent_end;
 next:
 		path->slots[0]++;
 		cond_resched();
 	}
-check_pending:
-	/* we have to make sure we didn't find an extent that has already
-	 * been allocated by the map tree or the original allocation
-	 */
-	BUG_ON(*start < search_start);
 
-	if (*start + num_bytes > search_end) {
-		ret = -ENOSPC;
-		goto error;
+	hole_size = search_end - search_start;
+	if (hole_size > max_hole_size) {
+		max_hole_start = search_start;
+		max_hole_size = hole_size;
 	}
-	/* check for pending inserts here */
-	ret = 0;
 
-error:
+	/* See above. */
+	if (hole_size < num_bytes)
+		ret = -ENOSPC;
+	else
+		ret = 0;
+
+out:
 	btrfs_free_path(path);
+error:
+	*start = max_hole_start;
+	if (len)
+		*len = max_hole_size;
 	return ret;
 }
 
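The rewritten find_free_dev_extent() always reports the best hole it saw through *start and *len, even when it returns -ENOSPC, so a caller can fall back to a smaller allocation instead of failing outright; __btrfs_alloc_chunk() below relies on exactly this. A hedged usage sketch (alloc_dev_extent_best_effort() is hypothetical):

	static int alloc_dev_extent_best_effort(struct btrfs_trans_handle *trans,
						struct btrfs_device *device,
						u64 want, u64 min_bytes,
						u64 *start, u64 *len)
	{
		int ret = find_free_dev_extent(trans, device, want, start, len);

		/* on -ENOSPC, *start/*len describe the largest hole found */
		if (ret == -ENOSPC && *len >= min_bytes)
			return 0;
		return ret;
	}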
@@ -879,14 +962,14 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	if (ret > 0) {
 		ret = btrfs_previous_item(root, path, key.objectid,
 					  BTRFS_DEV_EXTENT_KEY);
-		BUG_ON(ret);
+		if (ret)
+			goto out;
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		extent = btrfs_item_ptr(leaf, path->slots[0],
 					struct btrfs_dev_extent);
 		BUG_ON(found_key.offset > start || found_key.offset +
 		       btrfs_dev_extent_length(leaf, extent) < start);
-		ret = 0;
 	} else if (ret == 0) {
 		leaf = path->nodes[0];
 		extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -897,8 +980,8 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	if (device->bytes_used > 0)
 		device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
 	ret = btrfs_del_item(trans, root, path);
-	BUG_ON(ret);
 
+out:
 	btrfs_free_path(path);
 	return ret;
 }
@@ -1098,6 +1181,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 		return -ENOMEM;
 
 	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		return PTR_ERR(trans);
+	}
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.type = BTRFS_DEV_ITEM_KEY;
 	key.offset = device->devid;
@@ -1129,11 +1216,13 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	struct block_device *bdev;
 	struct buffer_head *bh = NULL;
 	struct btrfs_super_block *disk_super;
+	struct btrfs_fs_devices *cur_devices;
 	u64 all_avail;
 	u64 devid;
 	u64 num_devices;
 	u8 *dev_uuid;
 	int ret = 0;
+	bool clear_super = false;
 
 	mutex_lock(&uuid_mutex);
 	mutex_lock(&root->fs_info->volume_mutex);
@@ -1164,14 +1253,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 		device = NULL;
 		devices = &root->fs_info->fs_devices->devices;
-		mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+		/*
+		 * It is safe to read the devices since the volume_mutex
+		 * is held.
+		 */
 		list_for_each_entry(tmp, devices, dev_list) {
 			if (tmp->in_fs_metadata && !tmp->bdev) {
 				device = tmp;
 				break;
 			}
 		}
-		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 		bdev = NULL;
 		bh = NULL;
 		disk_super = NULL;
@@ -1181,8 +1272,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 			goto out;
 		}
 	} else {
-		bdev = open_bdev_exclusive(device_path, FMODE_READ,
-				      root->fs_info->bdev_holder);
+		bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
+					  root->fs_info->bdev_holder);
 		if (IS_ERR(bdev)) {
 			ret = PTR_ERR(bdev);
 			goto out;
@@ -1191,7 +1282,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		set_blocksize(bdev, 4096);
 		bh = btrfs_read_dev_super(bdev);
 		if (!bh) {
-			ret = -EIO;
+			ret = -EINVAL;
 			goto error_close;
 		}
 		disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1213,31 +1304,39 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	}
 
 	if (device->writeable) {
+		lock_chunks(root);
 		list_del_init(&device->dev_alloc_list);
+		unlock_chunks(root);
 		root->fs_info->fs_devices->rw_devices--;
+		clear_super = true;
 	}
 
 	ret = btrfs_shrink_device(device, 0);
 	if (ret)
-		goto error_brelse;
+		goto error_undo;
 
 	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
 	if (ret)
-		goto error_brelse;
+		goto error_undo;
 
 	device->in_fs_metadata = 0;
+	btrfs_scrub_cancel_dev(root, device);
 
 	/*
 	 * the device list mutex makes sure that we don't change
 	 * the device list while someone else is writing out all
 	 * the device supers.
 	 */
+
+	cur_devices = device->fs_devices;
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
-	list_del_init(&device->dev_list);
-	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+	list_del_rcu(&device->dev_list);
 
 	device->fs_devices->num_devices--;
 
+	if (device->missing)
+		root->fs_info->fs_devices->missing_devices--;
+
 	next_device = list_entry(root->fs_info->fs_devices->devices.next,
 				 struct btrfs_device, dev_list);
 	if (device->bdev == root->fs_info->sb->s_bdev)
@@ -1245,34 +1344,36 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
 		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
 
-	if (device->bdev) {
-		close_bdev_exclusive(device->bdev, device->mode);
-		device->bdev = NULL;
+	if (device->bdev)
 		device->fs_devices->open_devices--;
-	}
+
+	call_rcu(&device->rcu, free_device);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
 	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
 	btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
 
-	if (device->fs_devices->open_devices == 0) {
+	if (cur_devices->open_devices == 0) {
 		struct btrfs_fs_devices *fs_devices;
 		fs_devices = root->fs_info->fs_devices;
 		while (fs_devices) {
-			if (fs_devices->seed == device->fs_devices)
+			if (fs_devices->seed == cur_devices)
 				break;
 			fs_devices = fs_devices->seed;
 		}
-		fs_devices->seed = device->fs_devices->seed;
-		device->fs_devices->seed = NULL;
-		__btrfs_close_devices(device->fs_devices);
-		free_fs_devices(device->fs_devices);
+		fs_devices->seed = cur_devices->seed;
+		cur_devices->seed = NULL;
+		lock_chunks(root);
+		__btrfs_close_devices(cur_devices);
+		unlock_chunks(root);
+		free_fs_devices(cur_devices);
 	}
 
 	/*
 	 * at this point, the device is zero sized. We want to
 	 * remove it from the devices list and zero out the old super
 	 */
-	if (device->writeable) {
+	if (clear_super) {
 		/* make sure this device isn't detected as part of
 		 * the FS anymore
 		 */
@@ -1281,19 +1382,26 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		sync_dirty_buffer(bh);
 	}
 
-	kfree(device->name);
-	kfree(device);
 	ret = 0;
 
 error_brelse:
 	brelse(bh);
 error_close:
 	if (bdev)
-		close_bdev_exclusive(bdev, FMODE_READ);
+		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
 out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
 	return ret;
+error_undo:
+	if (device->writeable) {
+		lock_chunks(root);
+		list_add(&device->dev_alloc_list,
+			 &root->fs_info->fs_devices->alloc_list);
+		unlock_chunks(root);
+		root->fs_info->fs_devices->rw_devices++;
+	}
+	goto error_brelse;
 }
 
 /*
@@ -1330,7 +1438,12 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
 	INIT_LIST_HEAD(&seed_devices->devices);
 	INIT_LIST_HEAD(&seed_devices->alloc_list);
 	mutex_init(&seed_devices->device_list_mutex);
-	list_splice_init(&fs_devices->devices, &seed_devices->devices);
+
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
+			      synchronize_rcu);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
 	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
 	list_for_each_entry(device, &seed_devices->devices, dev_list) {
 		device->fs_devices = seed_devices;
@@ -1391,7 +1504,7 @@ next_slot:
 			goto error;
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		btrfs_release_path(root, path);
+		btrfs_release_path(path);
 		continue;
 	}
 
@@ -1441,7 +1554,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
 		return -EINVAL;
 
-	bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
+	bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
+				  root->fs_info->bdev_holder);
 	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
 
@@ -1482,14 +1596,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
 	ret = find_next_devid(root, &device->devid);
 	if (ret) {
+		kfree(device->name);
 		kfree(device);
 		goto error;
 	}
 
 	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		kfree(device->name);
+		kfree(device);
+		ret = PTR_ERR(trans);
+		goto error;
+	}
+
 	lock_chunks(root);
 
-	device->barriers = 1;
 	device->writeable = 1;
 	device->work.func = pending_bios_fn;
 	generate_random_uuid(device->uuid);
@@ -1503,7 +1624,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = bdev;
 	device->in_fs_metadata = 1;
-	device->mode = 0;
+	device->mode = FMODE_EXCL;
 	set_blocksize(device->bdev, 4096);
 
 	if (seeding_dev) {
@@ -1519,7 +1640,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	 * half setup
 	 */
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
-	list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
+	list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
 	list_add(&device->dev_alloc_list,
 		 &root->fs_info->fs_devices->alloc_list);
 	root->fs_info->fs_devices->num_devices++;
@@ -1568,7 +1689,7 @@ out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	return ret;
 error:
-	close_bdev_exclusive(bdev, 0);
+	blkdev_put(bdev, FMODE_EXCL);
 	if (seeding_dev) {
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
@@ -1677,10 +1798,9 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 
 	ret = btrfs_del_item(trans, root, path);
-	BUG_ON(ret);
 
 	btrfs_free_path(path);
-	return 0;
+	return ret;
 }
 
 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
@@ -1755,7 +1875,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 		return ret;
 
 	trans = btrfs_start_transaction(root, 0);
-	BUG_ON(!trans);
+	BUG_ON(IS_ERR(trans));
 
 	lock_chunks(root);
 
@@ -1786,6 +1906,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 
 	BUG_ON(ret);
 
+	trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
+
 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
 		BUG_ON(ret);
@@ -1853,7 +1975,7 @@ again:
 		chunk = btrfs_item_ptr(leaf, path->slots[0],
 				       struct btrfs_chunk);
 		chunk_type = btrfs_chunk_type(leaf, chunk);
-		btrfs_release_path(chunk_root, path);
+		btrfs_release_path(path);
 
 		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
 			ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
@@ -1901,7 +2023,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	u64 size_to_free;
 	struct btrfs_path *path;
 	struct btrfs_key key;
-	struct btrfs_chunk *chunk;
 	struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key found_key;
@@ -1909,6 +2030,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	mutex_lock(&dev_root->fs_info->volume_mutex);
 	dev_root = dev_root->fs_info->dev_root;
 
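Balancing relocates every chunk in the filesystem, so the new check rejects unprivileged callers before any work is done. capable(CAP_SYS_ADMIN) is the usual gate for privileged filesystem operations; a minimal sketch of the pattern (my_priv_op() is hypothetical):

	#include <linux/capability.h>

	static long my_priv_op(void)
	{
		if (!capable(CAP_SYS_ADMIN))
			return -EPERM;	/* fail early, before taking locks */

		/* ... privileged work ... */
		return 0;
	}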
@@ -1927,7 +2051,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
 		BUG_ON(ret);
 
 		trans = btrfs_start_transaction(dev_root, 0);
-		BUG_ON(!trans);
+		BUG_ON(IS_ERR(trans));
 
 		ret = btrfs_grow_device(trans, device, old_size);
 		BUG_ON(ret);
@@ -1965,19 +2089,17 @@ int btrfs_balance(struct btrfs_root *dev_root)
 		if (found_key.objectid != key.objectid)
 			break;
 
-		chunk = btrfs_item_ptr(path->nodes[0],
-				       path->slots[0],
-				       struct btrfs_chunk);
 		/* chunk zero is special */
 		if (found_key.offset == 0)
 			break;
 
-		btrfs_release_path(chunk_root, path);
+		btrfs_release_path(path);
 		ret = btrfs_relocate_chunk(chunk_root,
 					   chunk_root->root_key.objectid,
 					   found_key.objectid,
 					   found_key.offset);
-		BUG_ON(ret && ret != -ENOSPC);
+		if (ret && ret != -ENOSPC)
+			goto error;
 		key.offset = found_key.offset - 1;
 	}
 	ret = 0;
@@ -2044,7 +2166,7 @@ again:
 			goto done;
 		if (ret) {
 			ret = 0;
-			btrfs_release_path(root, path);
+			btrfs_release_path(path);
 			break;
 		}
 
@@ -2053,7 +2175,7 @@ again:
 		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
 
 		if (key.objectid != device->devid) {
-			btrfs_release_path(root, path);
+			btrfs_release_path(path);
 			break;
 		}
 
@@ -2061,14 +2183,14 @@ again:
 		length = btrfs_dev_extent_length(l, dev_extent);
 
 		if (key.offset + length <= new_size) {
-			btrfs_release_path(root, path);
+			btrfs_release_path(path);
 			break;
 		}
 
 		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
 		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
-		btrfs_release_path(root, path);
+		btrfs_release_path(path);
 
 		ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
 					   chunk_offset);
@@ -2096,6 +2218,11 @@ again:
 
 	/* Shrinking succeeded, else we would be at "done". */
 	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto done;
+	}
+
 	lock_chunks(root);
 
 	device->disk_total_bytes = new_size;
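The added IS_ERR() checks reflect that btrfs_start_transaction() reports failure through an ERR_PTR()-encoded pointer, never NULL, so a plain NULL test (as in the BUG_ON(!trans) sites fixed elsewhere in this diff) cannot catch it. A hedged sketch of the convention (do_in_transaction() is hypothetical):

	#include <linux/err.h>

	static int do_in_transaction(struct btrfs_root *root)
	{
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);	/* e.g. -ENOMEM or -EROFS */

		/* ... modify trees under the transaction ... */
		return btrfs_end_transaction(trans, root);
	}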
@@ -2139,211 +2266,243 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
-					int num_stripes, int sub_stripes)
+/*
+ * sort the devices in descending order by max_avail, total_avail
+ */
+static int btrfs_cmp_device_info(const void *a, const void *b)
 {
-	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
-		return calc_size;
-	else if (type & BTRFS_BLOCK_GROUP_RAID10)
-		return calc_size * (num_stripes / sub_stripes);
-	else
-		return calc_size * num_stripes;
+	const struct btrfs_device_info *di_a = a;
+	const struct btrfs_device_info *di_b = b;
+
+	if (di_a->max_avail > di_b->max_avail)
+		return -1;
+	if (di_a->max_avail < di_b->max_avail)
+		return 1;
+	if (di_a->total_avail > di_b->total_avail)
+		return -1;
+	if (di_a->total_avail < di_b->total_avail)
+		return 1;
+	return 0;
 }
 
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *extent_root,
 			       struct map_lookup **map_ret,
-			       u64 *num_bytes, u64 *stripe_size,
+			       u64 *num_bytes_out, u64 *stripe_size_out,
 			       u64 start, u64 type)
 {
 	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct btrfs_device *device = NULL;
 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
 	struct list_head *cur;
 	struct map_lookup *map = NULL;
 	struct extent_map_tree *em_tree;
 	struct extent_map *em;
-	struct list_head private_devs;
-	int min_stripe_size = 1 * 1024 * 1024;
-	u64 calc_size = 1024 * 1024 * 1024;
-	u64 max_chunk_size = calc_size;
-	u64 min_free;
-	u64 avail;
-	u64 max_avail = 0;
-	u64 dev_offset;
-	int num_stripes = 1;
-	int min_stripes = 1;
-	int sub_stripes = 0;
-	int looped = 0;
+	struct btrfs_device_info *devices_info = NULL;
+	u64 total_avail;
+	int num_stripes;	/* total number of stripes to allocate */
+	int sub_stripes;	/* sub_stripes info for map */
+	int dev_stripes;	/* stripes per dev */
+	int devs_max;		/* max devs to use */
+	int devs_min;		/* min devs needed */
+	int devs_increment;	/* ndevs has to be a multiple of this */
+	int ncopies;		/* how many copies to data has */
 	int ret;
-	int index;
-	int stripe_len = 64 * 1024;
+	u64 max_stripe_size;
+	u64 max_chunk_size;
+	u64 stripe_size;
+	u64 num_bytes;
+	int ndevs;
+	int i;
+	int j;
 
 	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
 	    (type & BTRFS_BLOCK_GROUP_DUP)) {
 		WARN_ON(1);
 		type &= ~BTRFS_BLOCK_GROUP_DUP;
 	}
+
 	if (list_empty(&fs_devices->alloc_list))
 		return -ENOSPC;
 
-	if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-		num_stripes = fs_devices->rw_devices;
-		min_stripes = 2;
-	}
+	sub_stripes = 1;
+	dev_stripes = 1;
+	devs_increment = 1;
+	ncopies = 1;
+	devs_max = 0;	/* 0 == as many as possible */
+	devs_min = 1;
+
+	/*
+	 * define the properties of each RAID type.
+	 * FIXME: move this to a global table and use it in all RAID
+	 * calculation code
+	 */
 	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
-		num_stripes = 2;
-		min_stripes = 2;
-	}
-	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
-		if (fs_devices->rw_devices < 2)
-			return -ENOSPC;
-		num_stripes = 2;
-		min_stripes = 2;
-	}
-	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-		num_stripes = fs_devices->rw_devices;
-		if (num_stripes < 4)
-			return -ENOSPC;
-		num_stripes &= ~(u32)1;
+		dev_stripes = 2;
+		ncopies = 2;
+		devs_max = 1;
+	} else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
+		devs_min = 2;
+	} else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
+		devs_increment = 2;
+		ncopies = 2;
+		devs_max = 2;
+		devs_min = 2;
+	} else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
 		sub_stripes = 2;
-		min_stripes = 4;
+		devs_increment = 2;
+		ncopies = 2;
+		devs_min = 4;
+	} else {
+		devs_max = 1;
 	}
 
 	if (type & BTRFS_BLOCK_GROUP_DATA) {
-		max_chunk_size = 10 * calc_size;
-		min_stripe_size = 64 * 1024 * 1024;
+		max_stripe_size = 1024 * 1024 * 1024;
+		max_chunk_size = 10 * max_stripe_size;
 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
-		max_chunk_size = 256 * 1024 * 1024;
-		min_stripe_size = 32 * 1024 * 1024;
+		max_stripe_size = 256 * 1024 * 1024;
+		max_chunk_size = max_stripe_size;
 	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
-		calc_size = 8 * 1024 * 1024;
-		max_chunk_size = calc_size * 2;
-		min_stripe_size = 1 * 1024 * 1024;
+		max_stripe_size = 8 * 1024 * 1024;
+		max_chunk_size = 2 * max_stripe_size;
+	} else {
+		printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
+		       type);
+		BUG_ON(1);
 	}
 
 	/* we don't want a chunk larger than 10% of writeable space */
 	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
 			     max_chunk_size);
 
-again:
-	max_avail = 0;
-	if (!map || map->num_stripes != num_stripes) {
-		kfree(map);
-		map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
-		if (!map)
-			return -ENOMEM;
-		map->num_stripes = num_stripes;
-	}
-
-	if (calc_size * num_stripes > max_chunk_size) {
-		calc_size = max_chunk_size;
-		do_div(calc_size, num_stripes);
-		do_div(calc_size, stripe_len);
-		calc_size *= stripe_len;
-	}
+	devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+			       GFP_NOFS);
+	if (!devices_info)
+		return -ENOMEM;
 
-	/* we don't want tiny stripes */
-	if (!looped)
-		calc_size = max_t(u64, min_stripe_size, calc_size);
+	cur = fs_devices->alloc_list.next;
 
 	/*
-	 * we're about to do_div by the stripe_len so lets make sure
-	 * we end up with something bigger than a stripe
+	 * in the first pass through the devices list, we gather information
+	 * about the available holes on each device.
 	 */
-	calc_size = max_t(u64, calc_size, stripe_len * 4);
+	ndevs = 0;
+	while (cur != &fs_devices->alloc_list) {
+		struct btrfs_device *device;
+		u64 max_avail;
+		u64 dev_offset;
 
-	do_div(calc_size, stripe_len);
-	calc_size *= stripe_len;
+		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
 
-	cur = fs_devices->alloc_list.next;
-	index = 0;
+		cur = cur->next;
 
-	if (type & BTRFS_BLOCK_GROUP_DUP)
-		min_free = calc_size * 2;
-	else
-		min_free = calc_size;
+		if (!device->writeable) {
+			printk(KERN_ERR
+			       "btrfs: read-only device in alloc_list\n");
+			WARN_ON(1);
+			continue;
+		}
 
-	/*
-	 * we add 1MB because we never use the first 1MB of the device, unless
-	 * we've looped, then we are likely allocating the maximum amount of
-	 * space left already
-	 */
-	if (!looped)
-		min_free += 1024 * 1024;
+		if (!device->in_fs_metadata)
+			continue;
 
-	INIT_LIST_HEAD(&private_devs);
-	while (index < num_stripes) {
-		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
-		BUG_ON(!device->writeable);
 		if (device->total_bytes > device->bytes_used)
-			avail = device->total_bytes - device->bytes_used;
+			total_avail = device->total_bytes - device->bytes_used;
 		else
-			avail = 0;
-		cur = cur->next;
+			total_avail = 0;
+		/* avail is off by max(alloc_start, 1MB), but that is the same
+		 * for all devices, so it doesn't hurt the sorting later on
+		 */
 
-		if (device->in_fs_metadata && avail >= min_free) {
-			ret = find_free_dev_extent(trans, device,
-						   min_free, &dev_offset,
-						   &max_avail);
-			if (ret == 0) {
-				list_move_tail(&device->dev_alloc_list,
-					       &private_devs);
-				map->stripes[index].dev = device;
-				map->stripes[index].physical = dev_offset;
-				index++;
-				if (type & BTRFS_BLOCK_GROUP_DUP) {
-					map->stripes[index].dev = device;
-					map->stripes[index].physical =
-						dev_offset + calc_size;
-					index++;
-				}
-			}
-		} else if (device->in_fs_metadata && avail > max_avail)
-			max_avail = avail;
-		if (cur == &fs_devices->alloc_list)
-			break;
+		ret = find_free_dev_extent(trans, device,
+					   max_stripe_size * dev_stripes,
+					   &dev_offset, &max_avail);
+		if (ret && ret != -ENOSPC)
+			goto error;
+
+		if (ret == 0)
+			max_avail = max_stripe_size * dev_stripes;
+
+		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
+			continue;
+
+		devices_info[ndevs].dev_offset = dev_offset;
+		devices_info[ndevs].max_avail = max_avail;
+		devices_info[ndevs].total_avail = total_avail;
+		devices_info[ndevs].dev = device;
+		++ndevs;
 	}
-	list_splice(&private_devs, &fs_devices->alloc_list);
-	if (index < num_stripes) {
-		if (index >= min_stripes) {
-			num_stripes = index;
-			if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-				num_stripes /= sub_stripes;
-				num_stripes *= sub_stripes;
-			}
-			looped = 1;
-			goto again;
-		}
-		if (!looped && max_avail > 0) {
-			looped = 1;
-			calc_size = max_avail;
-			goto again;
+
+	/*
+	 * now sort the devices by hole size / available space
+	 */
+	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+	     btrfs_cmp_device_info, NULL);
+
+	/* round down to number of usable stripes */
+	ndevs -= ndevs % devs_increment;
+
+	if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
+		ret = -ENOSPC;
+		goto error;
+	}
+
+	if (devs_max && ndevs > devs_max)
+		ndevs = devs_max;
+	/*
+	 * the primary goal is to maximize the number of stripes, so use as many
+	 * devices as possible, even if the stripes are not maximum sized.
+	 */
+	stripe_size = devices_info[ndevs-1].max_avail;
+	num_stripes = ndevs * dev_stripes;
+
+	if (stripe_size * num_stripes > max_chunk_size * ncopies) {
+		stripe_size = max_chunk_size * ncopies;
+		do_div(stripe_size, num_stripes);
+	}
+
+	do_div(stripe_size, dev_stripes);
+	do_div(stripe_size, BTRFS_STRIPE_LEN);
+	stripe_size *= BTRFS_STRIPE_LEN;
+
+	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+	if (!map) {
+		ret = -ENOMEM;
+		goto error;
+	}
+	map->num_stripes = num_stripes;
+
+	for (i = 0; i < ndevs; ++i) {
+		for (j = 0; j < dev_stripes; ++j) {
+			int s = i * dev_stripes + j;
+			map->stripes[s].dev = devices_info[i].dev;
+			map->stripes[s].physical = devices_info[i].dev_offset +
+						   j * stripe_size;
 		}
-		kfree(map);
-		return -ENOSPC;
 	}
 	map->sector_size = extent_root->sectorsize;
-	map->stripe_len = stripe_len;
-	map->io_align = stripe_len;
-	map->io_width = stripe_len;
+	map->stripe_len = BTRFS_STRIPE_LEN;
+	map->io_align = BTRFS_STRIPE_LEN;
+	map->io_width = BTRFS_STRIPE_LEN;
 	map->type = type;
-	map->num_stripes = num_stripes;
 	map->sub_stripes = sub_stripes;
 
 	*map_ret = map;
-	*stripe_size = calc_size;
-	*num_bytes = chunk_bytes_by_type(type, calc_size,
-					 num_stripes, sub_stripes);
+	num_bytes = stripe_size * (num_stripes / ncopies);
+
+	*stripe_size_out = stripe_size;
+	*num_bytes_out = num_bytes;
 
-	em = alloc_extent_map(GFP_NOFS);
+	trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
+
+	em = alloc_extent_map();
 	if (!em) {
-		kfree(map);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto error;
2343 } 2502 }
2344 em->bdev = (struct block_device *)map; 2503 em->bdev = (struct block_device *)map;
2345 em->start = start; 2504 em->start = start;
2346 em->len = *num_bytes; 2505 em->len = num_bytes;
2347 em->block_start = 0; 2506 em->block_start = 0;
2348 em->block_len = em->len; 2507 em->block_len = em->len;
2349 2508
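Editor's note on the em->bdev assignment kept above: it is not a real block device. volumes.c stashes the struct map_lookup pointer in the extent_map's bdev field and casts it back on every lookup. A minimal sketch of the read side, using the conventions of this file (em_tree and chunk_start stand for whatever mapping tree and chunk offset the caller holds):

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_start, 1);
	read_unlock(&em_tree->lock);
	if (em) {
		/* the chunk's stripe layout rides along in em->bdev */
		struct map_lookup *map = (struct map_lookup *)em->bdev;

		/* map->num_stripes, map->stripes[i].dev, ... */
		free_extent_map(em);
	}
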
@@ -2356,23 +2515,30 @@ again:
2356 2515
2357 ret = btrfs_make_block_group(trans, extent_root, 0, type, 2516 ret = btrfs_make_block_group(trans, extent_root, 0, type,
2358 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2517 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2359 start, *num_bytes); 2518 start, num_bytes);
2360 BUG_ON(ret); 2519 BUG_ON(ret);
2361 2520
2362 index = 0; 2521 for (i = 0; i < map->num_stripes; ++i) {
2363 while (index < map->num_stripes) { 2522 struct btrfs_device *device;
2364 device = map->stripes[index].dev; 2523 u64 dev_offset;
2365 dev_offset = map->stripes[index].physical; 2524
2525 device = map->stripes[i].dev;
2526 dev_offset = map->stripes[i].physical;
2366 2527
2367 ret = btrfs_alloc_dev_extent(trans, device, 2528 ret = btrfs_alloc_dev_extent(trans, device,
2368 info->chunk_root->root_key.objectid, 2529 info->chunk_root->root_key.objectid,
2369 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2530 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2370 start, dev_offset, calc_size); 2531 start, dev_offset, stripe_size);
2371 BUG_ON(ret); 2532 BUG_ON(ret);
2372 index++;
2373 } 2533 }
2374 2534
2535 kfree(devices_info);
2375 return 0; 2536 return 0;
2537
2538error:
2539 kfree(map);
2540 kfree(devices_info);
2541 return ret;
2376} 2542}
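Taken together, the rewritten allocator above gathers the holes on every writeable device, sorts them, keeps the ndevs largest, then sizes every stripe to the smallest hole that made the cut, caps the chunk at max_chunk_size of logical space, and rounds the stripe down to a whole number of BTRFS_STRIPE_LEN units. A self-contained userspace sketch of the same arithmetic, with hypothetical RAID1-style parameters (ncopies = 2, dev_stripes = 1) and plain 64-bit division standing in for do_div:

	#include <stdio.h>
	#include <stdint.h>

	#define SKETCH_STRIPE_LEN (64 * 1024ULL)	/* stands in for BTRFS_STRIPE_LEN */

	int main(void)
	{
		/* hypothetical RAID1-style chunk: two copies, one stripe per device */
		uint64_t max_avail[] = { 9ULL << 30, 5ULL << 30 }; /* sorted descending */
		uint64_t ndevs = 2, dev_stripes = 1, ncopies = 2;
		uint64_t max_chunk_size = 10ULL << 30;

		/* the smallest hole among the chosen devices bounds every stripe */
		uint64_t stripe_size = max_avail[ndevs - 1];
		uint64_t num_stripes = ndevs * dev_stripes;

		/* cap the chunk's logical size at max_chunk_size */
		if (stripe_size * num_stripes > max_chunk_size * ncopies)
			stripe_size = max_chunk_size * ncopies / num_stripes;

		/* per-device stripe, rounded down to whole stripe units */
		stripe_size /= dev_stripes;
		stripe_size = stripe_size / SKETCH_STRIPE_LEN * SKETCH_STRIPE_LEN;

		printf("stripe_size=%llu chunk_bytes=%llu\n",
		       (unsigned long long)stripe_size,
		       (unsigned long long)(stripe_size * (num_stripes / ncopies)));
		return 0;
	}

With these example values no clamping triggers: each device contributes a 5 GiB stripe and the chunk offers 5 GiB of logical space, mirrored on both devices.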
2377 2543
2378static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2544static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
@@ -2438,6 +2604,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2438 item_size); 2604 item_size);
2439 BUG_ON(ret); 2605 BUG_ON(ret);
2440 } 2606 }
2607
2441 kfree(chunk); 2608 kfree(chunk);
2442 return 0; 2609 return 0;
2443} 2610}
@@ -2569,7 +2736,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2569 2736
2570void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 2737void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2571{ 2738{
2572 extent_map_tree_init(&tree->map_tree, GFP_NOFS); 2739 extent_map_tree_init(&tree->map_tree);
2573} 2740}
2574 2741
2575void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 2742void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
@@ -2635,14 +2802,17 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2635static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2802static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2636 u64 logical, u64 *length, 2803 u64 logical, u64 *length,
2637 struct btrfs_multi_bio **multi_ret, 2804 struct btrfs_multi_bio **multi_ret,
2638 int mirror_num, struct page *unplug_page) 2805 int mirror_num)
2639{ 2806{
2640 struct extent_map *em; 2807 struct extent_map *em;
2641 struct map_lookup *map; 2808 struct map_lookup *map;
2642 struct extent_map_tree *em_tree = &map_tree->map_tree; 2809 struct extent_map_tree *em_tree = &map_tree->map_tree;
2643 u64 offset; 2810 u64 offset;
2644 u64 stripe_offset; 2811 u64 stripe_offset;
2812 u64 stripe_end_offset;
2645 u64 stripe_nr; 2813 u64 stripe_nr;
2814 u64 stripe_nr_orig;
2815 u64 stripe_nr_end;
2646 int stripes_allocated = 8; 2816 int stripes_allocated = 8;
2647 int stripes_required = 1; 2817 int stripes_required = 1;
2648 int stripe_index; 2818 int stripe_index;
@@ -2651,7 +2821,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2651 int max_errors = 0; 2821 int max_errors = 0;
2652 struct btrfs_multi_bio *multi = NULL; 2822 struct btrfs_multi_bio *multi = NULL;
2653 2823
2654 if (multi_ret && !(rw & REQ_WRITE)) 2824 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2655 stripes_allocated = 1; 2825 stripes_allocated = 1;
2656again: 2826again:
2657 if (multi_ret) { 2827 if (multi_ret) {
@@ -2667,11 +2837,6 @@ again:
2667 em = lookup_extent_mapping(em_tree, logical, *length); 2837 em = lookup_extent_mapping(em_tree, logical, *length);
2668 read_unlock(&em_tree->lock); 2838 read_unlock(&em_tree->lock);
2669 2839
2670 if (!em && unplug_page) {
2671 kfree(multi);
2672 return 0;
2673 }
2674
2675 if (!em) { 2840 if (!em) {
2676 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2841 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2677 (unsigned long long)logical, 2842 (unsigned long long)logical,
@@ -2697,7 +2862,15 @@ again:
2697 max_errors = 1; 2862 max_errors = 1;
2698 } 2863 }
2699 } 2864 }
2700 if (multi_ret && (rw & REQ_WRITE) && 2865 if (rw & REQ_DISCARD) {
2866 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2867 BTRFS_BLOCK_GROUP_RAID1 |
2868 BTRFS_BLOCK_GROUP_DUP |
2869 BTRFS_BLOCK_GROUP_RAID10)) {
2870 stripes_required = map->num_stripes;
2871 }
2872 }
2873 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2701 stripes_allocated < stripes_required) { 2874 stripes_allocated < stripes_required) {
2702 stripes_allocated = map->num_stripes; 2875 stripes_allocated = map->num_stripes;
2703 free_extent_map(em); 2876 free_extent_map(em);
@@ -2717,23 +2890,37 @@ again:
2717 /* stripe_offset is the offset of this block in its stripe*/ 2890 /* stripe_offset is the offset of this block in its stripe*/
2718 stripe_offset = offset - stripe_offset; 2891 stripe_offset = offset - stripe_offset;
2719 2892
2720 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 2893 if (rw & REQ_DISCARD)
2721 BTRFS_BLOCK_GROUP_RAID10 | 2894 *length = min_t(u64, em->len - offset, *length);
2722 BTRFS_BLOCK_GROUP_DUP)) { 2895 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2896 BTRFS_BLOCK_GROUP_RAID1 |
2897 BTRFS_BLOCK_GROUP_RAID10 |
2898 BTRFS_BLOCK_GROUP_DUP)) {
2723 /* we limit the length of each bio to what fits in a stripe */ 2899 /* we limit the length of each bio to what fits in a stripe */
2724 *length = min_t(u64, em->len - offset, 2900 *length = min_t(u64, em->len - offset,
2725 map->stripe_len - stripe_offset); 2901 map->stripe_len - stripe_offset);
2726 } else { 2902 } else {
2727 *length = em->len - offset; 2903 *length = em->len - offset;
2728 } 2904 }
2729 2905
2730 if (!multi_ret && !unplug_page) 2906 if (!multi_ret)
2731 goto out; 2907 goto out;
2732 2908
2733 num_stripes = 1; 2909 num_stripes = 1;
2734 stripe_index = 0; 2910 stripe_index = 0;
2735 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 2911 stripe_nr_orig = stripe_nr;
2736 if (unplug_page || (rw & REQ_WRITE)) 2912 stripe_nr_end = (offset + *length + map->stripe_len - 1) &
2913 (~(map->stripe_len - 1));
2914 do_div(stripe_nr_end, map->stripe_len);
2915 stripe_end_offset = stripe_nr_end * map->stripe_len -
2916 (offset + *length);
2917 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2918 if (rw & REQ_DISCARD)
2919 num_stripes = min_t(u64, map->num_stripes,
2920 stripe_nr_end - stripe_nr_orig);
2921 stripe_index = do_div(stripe_nr, map->num_stripes);
2922 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2923 if (rw & (REQ_WRITE | REQ_DISCARD))
2737 num_stripes = map->num_stripes; 2924 num_stripes = map->num_stripes;
2738 else if (mirror_num) 2925 else if (mirror_num)
2739 stripe_index = mirror_num - 1; 2926 stripe_index = mirror_num - 1;
@@ -2744,7 +2931,7 @@ again:
2744 } 2931 }
2745 2932
2746 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2933 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2747 if (rw & REQ_WRITE) 2934 if (rw & (REQ_WRITE | REQ_DISCARD))
2748 num_stripes = map->num_stripes; 2935 num_stripes = map->num_stripes;
2749 else if (mirror_num) 2936 else if (mirror_num)
2750 stripe_index = mirror_num - 1; 2937 stripe_index = mirror_num - 1;
@@ -2755,8 +2942,12 @@ again:
2755 stripe_index = do_div(stripe_nr, factor); 2942 stripe_index = do_div(stripe_nr, factor);
2756 stripe_index *= map->sub_stripes; 2943 stripe_index *= map->sub_stripes;
2757 2944
2758 if (unplug_page || (rw & REQ_WRITE)) 2945 if (rw & REQ_WRITE)
2759 num_stripes = map->sub_stripes; 2946 num_stripes = map->sub_stripes;
2947 else if (rw & REQ_DISCARD)
2948 num_stripes = min_t(u64, map->sub_stripes *
2949 (stripe_nr_end - stripe_nr_orig),
2950 map->num_stripes);
2760 else if (mirror_num) 2951 else if (mirror_num)
2761 stripe_index += mirror_num - 1; 2952 stripe_index += mirror_num - 1;
2762 else { 2953 else {
@@ -2774,24 +2965,101 @@ again:
2774 } 2965 }
2775 BUG_ON(stripe_index >= map->num_stripes); 2966 BUG_ON(stripe_index >= map->num_stripes);
2776 2967
2777 for (i = 0; i < num_stripes; i++) { 2968 if (rw & REQ_DISCARD) {
2778 if (unplug_page) { 2969 for (i = 0; i < num_stripes; i++) {
2779 struct btrfs_device *device;
2780 struct backing_dev_info *bdi;
2781
2782 device = map->stripes[stripe_index].dev;
2783 if (device->bdev) {
2784 bdi = blk_get_backing_dev_info(device->bdev);
2785 if (bdi->unplug_io_fn)
2786 bdi->unplug_io_fn(bdi, unplug_page);
2787 }
2788 } else {
2789 multi->stripes[i].physical = 2970 multi->stripes[i].physical =
2790 map->stripes[stripe_index].physical + 2971 map->stripes[stripe_index].physical +
2791 stripe_offset + stripe_nr * map->stripe_len; 2972 stripe_offset + stripe_nr * map->stripe_len;
2792 multi->stripes[i].dev = map->stripes[stripe_index].dev; 2973 multi->stripes[i].dev = map->stripes[stripe_index].dev;
2974
2975 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2976 u64 stripes;
2977 u32 last_stripe = 0;
2978 int j;
2979
2980 div_u64_rem(stripe_nr_end - 1,
2981 map->num_stripes,
2982 &last_stripe);
2983
2984 for (j = 0; j < map->num_stripes; j++) {
2985 u32 test;
2986
2987 div_u64_rem(stripe_nr_end - 1 - j,
2988 map->num_stripes, &test);
2989 if (test == stripe_index)
2990 break;
2991 }
2992 stripes = stripe_nr_end - 1 - j;
2993 do_div(stripes, map->num_stripes);
2994 multi->stripes[i].length = map->stripe_len *
2995 (stripes - stripe_nr + 1);
2996
2997 if (i == 0) {
2998 multi->stripes[i].length -=
2999 stripe_offset;
3000 stripe_offset = 0;
3001 }
3002 if (stripe_index == last_stripe)
3003 multi->stripes[i].length -=
3004 stripe_end_offset;
3005 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3006 u64 stripes;
3007 int j;
3008 int factor = map->num_stripes /
3009 map->sub_stripes;
3010 u32 last_stripe = 0;
3011
3012 div_u64_rem(stripe_nr_end - 1,
3013 factor, &last_stripe);
3014 last_stripe *= map->sub_stripes;
3015
3016 for (j = 0; j < factor; j++) {
3017 u32 test;
3018
3019 div_u64_rem(stripe_nr_end - 1 - j,
3020 factor, &test);
3021
3022 if (test ==
3023 stripe_index / map->sub_stripes)
3024 break;
3025 }
3026 stripes = stripe_nr_end - 1 - j;
3027 do_div(stripes, factor);
3028 multi->stripes[i].length = map->stripe_len *
3029 (stripes - stripe_nr + 1);
3030
3031 if (i < map->sub_stripes) {
3032 multi->stripes[i].length -=
3033 stripe_offset;
3034 if (i == map->sub_stripes - 1)
3035 stripe_offset = 0;
3036 }
3037 if (stripe_index >= last_stripe &&
3038 stripe_index <= (last_stripe +
3039 map->sub_stripes - 1)) {
3040 multi->stripes[i].length -=
3041 stripe_end_offset;
3042 }
3043 } else
3044 multi->stripes[i].length = *length;
3045
3046 stripe_index++;
3047 if (stripe_index == map->num_stripes) {
3048 /* This could only happen for RAID0/10 */
3049 stripe_index = 0;
3050 stripe_nr++;
3051 }
3052 }
3053 } else {
3054 for (i = 0; i < num_stripes; i++) {
3055 multi->stripes[i].physical =
3056 map->stripes[stripe_index].physical +
3057 stripe_offset +
3058 stripe_nr * map->stripe_len;
3059 multi->stripes[i].dev =
3060 map->stripes[stripe_index].dev;
3061 stripe_index++;
2793 } 3062 }
2794 stripe_index++;
2795 } 3063 }
2796 if (multi_ret) { 3064 if (multi_ret) {
2797 *multi_ret = multi; 3065 *multi_ret = multi;
@@ -2808,7 +3076,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2808 struct btrfs_multi_bio **multi_ret, int mirror_num) 3076 struct btrfs_multi_bio **multi_ret, int mirror_num)
2809{ 3077{
2810 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3078 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
2811 mirror_num, NULL); 3079 mirror_num);
2812} 3080}
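For reference, a hedged sketch of how callers elsewhere in btrfs use this entry point to resolve a logical address (fragment only; bio, logical and len are whatever the caller holds). On return *length has been clamped to what fits in one stripe, and the caller owns the returned multi:

	struct btrfs_multi_bio *multi = NULL;
	u64 map_length = len;
	int ret;

	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
			      logical, &map_length, &multi, 0);
	if (!ret) {
		/* one btrfs_bio_stripe per copy; aim the bio at one of them */
		bio->bi_bdev = multi->stripes[0].dev->bdev;
		bio->bi_sector = multi->stripes[0].physical >> 9;
		kfree(multi);
	}
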
2813 3081
2814int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3082int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -2876,14 +3144,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2876 return 0; 3144 return 0;
2877} 3145}
2878 3146
2879int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
2880 u64 logical, struct page *page)
2881{
2882 u64 length = PAGE_CACHE_SIZE;
2883 return __btrfs_map_block(map_tree, READ, logical, &length,
2884 NULL, 0, page);
2885}
2886
2887static void end_bio_multi_stripe(struct bio *bio, int err) 3147static void end_bio_multi_stripe(struct bio *bio, int err)
2888{ 3148{
2889 struct btrfs_multi_bio *multi = bio->bi_private; 3149 struct btrfs_multi_bio *multi = bio->bi_private;
@@ -3034,8 +3294,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3034 } 3294 }
3035 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3295 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
3036 dev = multi->stripes[dev_nr].dev; 3296 dev = multi->stripes[dev_nr].dev;
3037 BUG_ON(rw == WRITE && !dev->writeable); 3297 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3038 if (dev && dev->bdev) {
3039 bio->bi_bdev = dev->bdev; 3298 bio->bi_bdev = dev->bdev;
3040 if (async_submit) 3299 if (async_submit)
3041 schedule_bio(root, dev, rw, bio); 3300 schedule_bio(root, dev, rw, bio);
@@ -3084,12 +3343,13 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3084 return NULL; 3343 return NULL;
3085 list_add(&device->dev_list, 3344 list_add(&device->dev_list,
3086 &fs_devices->devices); 3345 &fs_devices->devices);
3087 device->barriers = 1;
3088 device->dev_root = root->fs_info->dev_root; 3346 device->dev_root = root->fs_info->dev_root;
3089 device->devid = devid; 3347 device->devid = devid;
3090 device->work.func = pending_bios_fn; 3348 device->work.func = pending_bios_fn;
3091 device->fs_devices = fs_devices; 3349 device->fs_devices = fs_devices;
3350 device->missing = 1;
3092 fs_devices->num_devices++; 3351 fs_devices->num_devices++;
3352 fs_devices->missing_devices++;
3093 spin_lock_init(&device->io_lock); 3353 spin_lock_init(&device->io_lock);
3094 INIT_LIST_HEAD(&device->dev_alloc_list); 3354 INIT_LIST_HEAD(&device->dev_alloc_list);
3095 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3355 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
@@ -3126,7 +3386,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
3126 free_extent_map(em); 3386 free_extent_map(em);
3127 } 3387 }
3128 3388
3129 em = alloc_extent_map(GFP_NOFS); 3389 em = alloc_extent_map();
3130 if (!em) 3390 if (!em)
3131 return -ENOMEM; 3391 return -ENOMEM;
3132 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3392 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
@@ -3287,6 +3547,15 @@ static int read_one_dev(struct btrfs_root *root,
3287 device = add_missing_dev(root, devid, dev_uuid); 3547 device = add_missing_dev(root, devid, dev_uuid);
3288 if (!device) 3548 if (!device)
3289 return -ENOMEM; 3549 return -ENOMEM;
3550 } else if (!device->missing) {
3551 /*
3552 * this happens when a device that was properly setup
3553 * in the device info lists suddenly goes bad.
3554 * device->bdev is NULL, and so we have to set
3555 * device->missing to one here
3556 */
3557 root->fs_info->fs_devices->missing_devices++;
3558 device->missing = 1;
3290 } 3559 }
3291 } 3560 }
3292 3561
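The hunk above closes the loop on missing-device accounting: a device can be absent in two ways, and both must end with device->missing set and fs_devices->missing_devices bumped. A condensed sketch of the resulting logic in read_one_dev, using the helpers named in this file:

	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
	if (!device) {
		/* never seen at scan time: fabricate a placeholder */
		device = add_missing_dev(root, devid, dev_uuid);
	} else if (!device->bdev && !device->missing) {
		/* known from the metadata, but its bdev vanished later */
		root->fs_info->fs_devices->missing_devices++;
		device->missing = 1;
	}
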
@@ -3306,15 +3575,6 @@ static int read_one_dev(struct btrfs_root *root,
3306 return ret; 3575 return ret;
3307} 3576}
3308 3577
3309int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
3310{
3311 struct btrfs_dev_item *dev_item;
3312
3313 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
3314 dev_item);
3315 return read_one_dev(root, buf, dev_item);
3316}
3317
3318int btrfs_read_sys_array(struct btrfs_root *root) 3578int btrfs_read_sys_array(struct btrfs_root *root)
3319{ 3579{
3320 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3580 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
@@ -3431,7 +3691,7 @@ again:
3431 } 3691 }
3432 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3692 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3433 key.objectid = 0; 3693 key.objectid = 0;
3434 btrfs_release_path(root, path); 3694 btrfs_release_path(path);
3435 goto again; 3695 goto again;
3436 } 3696 }
3437 ret = 0; 3697 ret = 0;