diff options
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r-- | fs/btrfs/volumes.c | 1094 |
1 files changed, 677 insertions, 417 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dd318ff280b2..19450bc53632 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/blkdev.h> | 22 | #include <linux/blkdev.h> |
23 | #include <linux/random.h> | 23 | #include <linux/random.h> |
24 | #include <linux/iocontext.h> | 24 | #include <linux/iocontext.h> |
25 | #include <linux/capability.h> | ||
25 | #include <asm/div64.h> | 26 | #include <asm/div64.h> |
26 | #include "compat.h" | 27 | #include "compat.h" |
27 | #include "ctree.h" | 28 | #include "ctree.h" |
@@ -32,38 +33,14 @@ | |||
32 | #include "volumes.h" | 33 | #include "volumes.h" |
33 | #include "async-thread.h" | 34 | #include "async-thread.h" |
34 | 35 | ||
35 | struct map_lookup { | ||
36 | u64 type; | ||
37 | int io_align; | ||
38 | int io_width; | ||
39 | int stripe_len; | ||
40 | int sector_size; | ||
41 | int num_stripes; | ||
42 | int sub_stripes; | ||
43 | struct btrfs_bio_stripe stripes[]; | ||
44 | }; | ||
45 | |||
46 | static int init_first_rw_device(struct btrfs_trans_handle *trans, | 36 | static int init_first_rw_device(struct btrfs_trans_handle *trans, |
47 | struct btrfs_root *root, | 37 | struct btrfs_root *root, |
48 | struct btrfs_device *device); | 38 | struct btrfs_device *device); |
49 | static int btrfs_relocate_sys_chunks(struct btrfs_root *root); | 39 | static int btrfs_relocate_sys_chunks(struct btrfs_root *root); |
50 | 40 | ||
51 | #define map_lookup_size(n) (sizeof(struct map_lookup) + \ | ||
52 | (sizeof(struct btrfs_bio_stripe) * (n))) | ||
53 | |||
54 | static DEFINE_MUTEX(uuid_mutex); | 41 | static DEFINE_MUTEX(uuid_mutex); |
55 | static LIST_HEAD(fs_uuids); | 42 | static LIST_HEAD(fs_uuids); |
56 | 43 | ||
57 | void btrfs_lock_volumes(void) | ||
58 | { | ||
59 | mutex_lock(&uuid_mutex); | ||
60 | } | ||
61 | |||
62 | void btrfs_unlock_volumes(void) | ||
63 | { | ||
64 | mutex_unlock(&uuid_mutex); | ||
65 | } | ||
66 | |||
67 | static void lock_chunks(struct btrfs_root *root) | 44 | static void lock_chunks(struct btrfs_root *root) |
68 | { | 45 | { |
69 | mutex_lock(&root->fs_info->chunk_mutex); | 46 | mutex_lock(&root->fs_info->chunk_mutex); |
@@ -161,22 +138,25 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) | |||
161 | struct bio *cur; | 138 | struct bio *cur; |
162 | int again = 0; | 139 | int again = 0; |
163 | unsigned long num_run; | 140 | unsigned long num_run; |
164 | unsigned long num_sync_run; | ||
165 | unsigned long batch_run = 0; | 141 | unsigned long batch_run = 0; |
166 | unsigned long limit; | 142 | unsigned long limit; |
167 | unsigned long last_waited = 0; | 143 | unsigned long last_waited = 0; |
168 | int force_reg = 0; | 144 | int force_reg = 0; |
145 | struct blk_plug plug; | ||
146 | |||
147 | /* | ||
148 | * this function runs all the bios we've collected for | ||
149 | * a particular device. We don't want to wander off to | ||
150 | * another device without first sending all of these down. | ||
151 | * So, setup a plug here and finish it off before we return | ||
152 | */ | ||
153 | blk_start_plug(&plug); | ||
169 | 154 | ||
170 | bdi = blk_get_backing_dev_info(device->bdev); | 155 | bdi = blk_get_backing_dev_info(device->bdev); |
171 | fs_info = device->dev_root->fs_info; | 156 | fs_info = device->dev_root->fs_info; |
172 | limit = btrfs_async_submit_limit(fs_info); | 157 | limit = btrfs_async_submit_limit(fs_info); |
173 | limit = limit * 2 / 3; | 158 | limit = limit * 2 / 3; |
174 | 159 | ||
175 | /* we want to make sure that every time we switch from the sync | ||
176 | * list to the normal list, we unplug | ||
177 | */ | ||
178 | num_sync_run = 0; | ||
179 | |||
180 | loop: | 160 | loop: |
181 | spin_lock(&device->io_lock); | 161 | spin_lock(&device->io_lock); |
182 | 162 | ||
@@ -222,15 +202,6 @@ loop_lock: | |||
222 | 202 | ||
223 | spin_unlock(&device->io_lock); | 203 | spin_unlock(&device->io_lock); |
224 | 204 | ||
225 | /* | ||
226 | * if we're doing the regular priority list, make sure we unplug | ||
227 | * for any high prio bios we've sent down | ||
228 | */ | ||
229 | if (pending_bios == &device->pending_bios && num_sync_run > 0) { | ||
230 | num_sync_run = 0; | ||
231 | blk_run_backing_dev(bdi, NULL); | ||
232 | } | ||
233 | |||
234 | while (pending) { | 205 | while (pending) { |
235 | 206 | ||
236 | rmb(); | 207 | rmb(); |
@@ -258,19 +229,11 @@ loop_lock: | |||
258 | 229 | ||
259 | BUG_ON(atomic_read(&cur->bi_cnt) == 0); | 230 | BUG_ON(atomic_read(&cur->bi_cnt) == 0); |
260 | 231 | ||
261 | if (cur->bi_rw & REQ_SYNC) | ||
262 | num_sync_run++; | ||
263 | |||
264 | submit_bio(cur->bi_rw, cur); | 232 | submit_bio(cur->bi_rw, cur); |
265 | num_run++; | 233 | num_run++; |
266 | batch_run++; | 234 | batch_run++; |
267 | if (need_resched()) { | 235 | if (need_resched()) |
268 | if (num_sync_run) { | ||
269 | blk_run_backing_dev(bdi, NULL); | ||
270 | num_sync_run = 0; | ||
271 | } | ||
272 | cond_resched(); | 236 | cond_resched(); |
273 | } | ||
274 | 237 | ||
275 | /* | 238 | /* |
276 | * we made progress, there is more work to do and the bdi | 239 | * we made progress, there is more work to do and the bdi |
@@ -303,13 +266,8 @@ loop_lock: | |||
303 | * against it before looping | 266 | * against it before looping |
304 | */ | 267 | */ |
305 | last_waited = ioc->last_waited; | 268 | last_waited = ioc->last_waited; |
306 | if (need_resched()) { | 269 | if (need_resched()) |
307 | if (num_sync_run) { | ||
308 | blk_run_backing_dev(bdi, NULL); | ||
309 | num_sync_run = 0; | ||
310 | } | ||
311 | cond_resched(); | 270 | cond_resched(); |
312 | } | ||
313 | continue; | 271 | continue; |
314 | } | 272 | } |
315 | spin_lock(&device->io_lock); | 273 | spin_lock(&device->io_lock); |
@@ -322,22 +280,6 @@ loop_lock: | |||
322 | } | 280 | } |
323 | } | 281 | } |
324 | 282 | ||
325 | if (num_sync_run) { | ||
326 | num_sync_run = 0; | ||
327 | blk_run_backing_dev(bdi, NULL); | ||
328 | } | ||
329 | /* | ||
330 | * IO has already been through a long path to get here. Checksumming, | ||
331 | * async helper threads, perhaps compression. We've done a pretty | ||
332 | * good job of collecting a batch of IO and should just unplug | ||
333 | * the device right away. | ||
334 | * | ||
335 | * This will help anyone who is waiting on the IO, they might have | ||
336 | * already unplugged, but managed to do so before the bio they | ||
337 | * cared about found its way down here. | ||
338 | */ | ||
339 | blk_run_backing_dev(bdi, NULL); | ||
340 | |||
341 | cond_resched(); | 283 | cond_resched(); |
342 | if (again) | 284 | if (again) |
343 | goto loop; | 285 | goto loop; |
@@ -348,6 +290,7 @@ loop_lock: | |||
348 | spin_unlock(&device->io_lock); | 290 | spin_unlock(&device->io_lock); |
349 | 291 | ||
350 | done: | 292 | done: |
293 | blk_finish_plug(&plug); | ||
351 | return 0; | 294 | return 0; |
352 | } | 295 | } |
353 | 296 | ||
@@ -398,7 +341,6 @@ static noinline int device_list_add(const char *path, | |||
398 | device->work.func = pending_bios_fn; | 341 | device->work.func = pending_bios_fn; |
399 | memcpy(device->uuid, disk_super->dev_item.uuid, | 342 | memcpy(device->uuid, disk_super->dev_item.uuid, |
400 | BTRFS_UUID_SIZE); | 343 | BTRFS_UUID_SIZE); |
401 | device->barriers = 1; | ||
402 | spin_lock_init(&device->io_lock); | 344 | spin_lock_init(&device->io_lock); |
403 | device->name = kstrdup(path, GFP_NOFS); | 345 | device->name = kstrdup(path, GFP_NOFS); |
404 | if (!device->name) { | 346 | if (!device->name) { |
@@ -408,17 +350,21 @@ static noinline int device_list_add(const char *path, | |||
408 | INIT_LIST_HEAD(&device->dev_alloc_list); | 350 | INIT_LIST_HEAD(&device->dev_alloc_list); |
409 | 351 | ||
410 | mutex_lock(&fs_devices->device_list_mutex); | 352 | mutex_lock(&fs_devices->device_list_mutex); |
411 | list_add(&device->dev_list, &fs_devices->devices); | 353 | list_add_rcu(&device->dev_list, &fs_devices->devices); |
412 | mutex_unlock(&fs_devices->device_list_mutex); | 354 | mutex_unlock(&fs_devices->device_list_mutex); |
413 | 355 | ||
414 | device->fs_devices = fs_devices; | 356 | device->fs_devices = fs_devices; |
415 | fs_devices->num_devices++; | 357 | fs_devices->num_devices++; |
416 | } else if (strcmp(device->name, path)) { | 358 | } else if (!device->name || strcmp(device->name, path)) { |
417 | name = kstrdup(path, GFP_NOFS); | 359 | name = kstrdup(path, GFP_NOFS); |
418 | if (!name) | 360 | if (!name) |
419 | return -ENOMEM; | 361 | return -ENOMEM; |
420 | kfree(device->name); | 362 | kfree(device->name); |
421 | device->name = name; | 363 | device->name = name; |
364 | if (device->missing) { | ||
365 | fs_devices->missing_devices--; | ||
366 | device->missing = 0; | ||
367 | } | ||
422 | } | 368 | } |
423 | 369 | ||
424 | if (found_transid > fs_devices->latest_trans) { | 370 | if (found_transid > fs_devices->latest_trans) { |
@@ -447,7 +393,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) | |||
447 | fs_devices->latest_trans = orig->latest_trans; | 393 | fs_devices->latest_trans = orig->latest_trans; |
448 | memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); | 394 | memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); |
449 | 395 | ||
450 | mutex_lock(&orig->device_list_mutex); | 396 | /* We have held the volume lock, it is safe to get the devices. */ |
451 | list_for_each_entry(orig_dev, &orig->devices, dev_list) { | 397 | list_for_each_entry(orig_dev, &orig->devices, dev_list) { |
452 | device = kzalloc(sizeof(*device), GFP_NOFS); | 398 | device = kzalloc(sizeof(*device), GFP_NOFS); |
453 | if (!device) | 399 | if (!device) |
@@ -462,7 +408,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) | |||
462 | device->devid = orig_dev->devid; | 408 | device->devid = orig_dev->devid; |
463 | device->work.func = pending_bios_fn; | 409 | device->work.func = pending_bios_fn; |
464 | memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); | 410 | memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); |
465 | device->barriers = 1; | ||
466 | spin_lock_init(&device->io_lock); | 411 | spin_lock_init(&device->io_lock); |
467 | INIT_LIST_HEAD(&device->dev_list); | 412 | INIT_LIST_HEAD(&device->dev_list); |
468 | INIT_LIST_HEAD(&device->dev_alloc_list); | 413 | INIT_LIST_HEAD(&device->dev_alloc_list); |
@@ -471,10 +416,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) | |||
471 | device->fs_devices = fs_devices; | 416 | device->fs_devices = fs_devices; |
472 | fs_devices->num_devices++; | 417 | fs_devices->num_devices++; |
473 | } | 418 | } |
474 | mutex_unlock(&orig->device_list_mutex); | ||
475 | return fs_devices; | 419 | return fs_devices; |
476 | error: | 420 | error: |
477 | mutex_unlock(&orig->device_list_mutex); | ||
478 | free_fs_devices(fs_devices); | 421 | free_fs_devices(fs_devices); |
479 | return ERR_PTR(-ENOMEM); | 422 | return ERR_PTR(-ENOMEM); |
480 | } | 423 | } |
@@ -485,13 +428,13 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) | |||
485 | 428 | ||
486 | mutex_lock(&uuid_mutex); | 429 | mutex_lock(&uuid_mutex); |
487 | again: | 430 | again: |
488 | mutex_lock(&fs_devices->device_list_mutex); | 431 | /* This is the initialized path, it is safe to release the devices. */ |
489 | list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { | 432 | list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { |
490 | if (device->in_fs_metadata) | 433 | if (device->in_fs_metadata) |
491 | continue; | 434 | continue; |
492 | 435 | ||
493 | if (device->bdev) { | 436 | if (device->bdev) { |
494 | close_bdev_exclusive(device->bdev, device->mode); | 437 | blkdev_put(device->bdev, device->mode); |
495 | device->bdev = NULL; | 438 | device->bdev = NULL; |
496 | fs_devices->open_devices--; | 439 | fs_devices->open_devices--; |
497 | } | 440 | } |
@@ -505,7 +448,6 @@ again: | |||
505 | kfree(device->name); | 448 | kfree(device->name); |
506 | kfree(device); | 449 | kfree(device); |
507 | } | 450 | } |
508 | mutex_unlock(&fs_devices->device_list_mutex); | ||
509 | 451 | ||
510 | if (fs_devices->seed) { | 452 | if (fs_devices->seed) { |
511 | fs_devices = fs_devices->seed; | 453 | fs_devices = fs_devices->seed; |
@@ -516,6 +458,29 @@ again: | |||
516 | return 0; | 458 | return 0; |
517 | } | 459 | } |
518 | 460 | ||
461 | static void __free_device(struct work_struct *work) | ||
462 | { | ||
463 | struct btrfs_device *device; | ||
464 | |||
465 | device = container_of(work, struct btrfs_device, rcu_work); | ||
466 | |||
467 | if (device->bdev) | ||
468 | blkdev_put(device->bdev, device->mode); | ||
469 | |||
470 | kfree(device->name); | ||
471 | kfree(device); | ||
472 | } | ||
473 | |||
474 | static void free_device(struct rcu_head *head) | ||
475 | { | ||
476 | struct btrfs_device *device; | ||
477 | |||
478 | device = container_of(head, struct btrfs_device, rcu); | ||
479 | |||
480 | INIT_WORK(&device->rcu_work, __free_device); | ||
481 | schedule_work(&device->rcu_work); | ||
482 | } | ||
483 | |||
519 | static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) | 484 | static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) |
520 | { | 485 | { |
521 | struct btrfs_device *device; | 486 | struct btrfs_device *device; |
@@ -523,20 +488,32 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) | |||
523 | if (--fs_devices->opened > 0) | 488 | if (--fs_devices->opened > 0) |
524 | return 0; | 489 | return 0; |
525 | 490 | ||
491 | mutex_lock(&fs_devices->device_list_mutex); | ||
526 | list_for_each_entry(device, &fs_devices->devices, dev_list) { | 492 | list_for_each_entry(device, &fs_devices->devices, dev_list) { |
527 | if (device->bdev) { | 493 | struct btrfs_device *new_device; |
528 | close_bdev_exclusive(device->bdev, device->mode); | 494 | |
495 | if (device->bdev) | ||
529 | fs_devices->open_devices--; | 496 | fs_devices->open_devices--; |
530 | } | 497 | |
531 | if (device->writeable) { | 498 | if (device->writeable) { |
532 | list_del_init(&device->dev_alloc_list); | 499 | list_del_init(&device->dev_alloc_list); |
533 | fs_devices->rw_devices--; | 500 | fs_devices->rw_devices--; |
534 | } | 501 | } |
535 | 502 | ||
536 | device->bdev = NULL; | 503 | new_device = kmalloc(sizeof(*new_device), GFP_NOFS); |
537 | device->writeable = 0; | 504 | BUG_ON(!new_device); |
538 | device->in_fs_metadata = 0; | 505 | memcpy(new_device, device, sizeof(*new_device)); |
506 | new_device->name = kstrdup(device->name, GFP_NOFS); | ||
507 | BUG_ON(device->name && !new_device->name); | ||
508 | new_device->bdev = NULL; | ||
509 | new_device->writeable = 0; | ||
510 | new_device->in_fs_metadata = 0; | ||
511 | list_replace_rcu(&device->dev_list, &new_device->dev_list); | ||
512 | |||
513 | call_rcu(&device->rcu, free_device); | ||
539 | } | 514 | } |
515 | mutex_unlock(&fs_devices->device_list_mutex); | ||
516 | |||
540 | WARN_ON(fs_devices->open_devices); | 517 | WARN_ON(fs_devices->open_devices); |
541 | WARN_ON(fs_devices->rw_devices); | 518 | WARN_ON(fs_devices->rw_devices); |
542 | fs_devices->opened = 0; | 519 | fs_devices->opened = 0; |
@@ -582,13 +559,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | |||
582 | int seeding = 1; | 559 | int seeding = 1; |
583 | int ret = 0; | 560 | int ret = 0; |
584 | 561 | ||
562 | flags |= FMODE_EXCL; | ||
563 | |||
585 | list_for_each_entry(device, head, dev_list) { | 564 | list_for_each_entry(device, head, dev_list) { |
586 | if (device->bdev) | 565 | if (device->bdev) |
587 | continue; | 566 | continue; |
588 | if (!device->name) | 567 | if (!device->name) |
589 | continue; | 568 | continue; |
590 | 569 | ||
591 | bdev = open_bdev_exclusive(device->name, flags, holder); | 570 | bdev = blkdev_get_by_path(device->name, flags, holder); |
592 | if (IS_ERR(bdev)) { | 571 | if (IS_ERR(bdev)) { |
593 | printk(KERN_INFO "open %s failed\n", device->name); | 572 | printk(KERN_INFO "open %s failed\n", device->name); |
594 | goto error; | 573 | goto error; |
@@ -596,8 +575,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | |||
596 | set_blocksize(bdev, 4096); | 575 | set_blocksize(bdev, 4096); |
597 | 576 | ||
598 | bh = btrfs_read_dev_super(bdev); | 577 | bh = btrfs_read_dev_super(bdev); |
599 | if (!bh) | 578 | if (!bh) { |
579 | ret = -EINVAL; | ||
600 | goto error_close; | 580 | goto error_close; |
581 | } | ||
601 | 582 | ||
602 | disk_super = (struct btrfs_super_block *)bh->b_data; | 583 | disk_super = (struct btrfs_super_block *)bh->b_data; |
603 | devid = btrfs_stack_device_id(&disk_super->dev_item); | 584 | devid = btrfs_stack_device_id(&disk_super->dev_item); |
@@ -635,12 +616,13 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | |||
635 | list_add(&device->dev_alloc_list, | 616 | list_add(&device->dev_alloc_list, |
636 | &fs_devices->alloc_list); | 617 | &fs_devices->alloc_list); |
637 | } | 618 | } |
619 | brelse(bh); | ||
638 | continue; | 620 | continue; |
639 | 621 | ||
640 | error_brelse: | 622 | error_brelse: |
641 | brelse(bh); | 623 | brelse(bh); |
642 | error_close: | 624 | error_close: |
643 | close_bdev_exclusive(bdev, FMODE_READ); | 625 | blkdev_put(bdev, flags); |
644 | error: | 626 | error: |
645 | continue; | 627 | continue; |
646 | } | 628 | } |
@@ -686,7 +668,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | |||
686 | 668 | ||
687 | mutex_lock(&uuid_mutex); | 669 | mutex_lock(&uuid_mutex); |
688 | 670 | ||
689 | bdev = open_bdev_exclusive(path, flags, holder); | 671 | flags |= FMODE_EXCL; |
672 | bdev = blkdev_get_by_path(path, flags, holder); | ||
690 | 673 | ||
691 | if (IS_ERR(bdev)) { | 674 | if (IS_ERR(bdev)) { |
692 | ret = PTR_ERR(bdev); | 675 | ret = PTR_ERR(bdev); |
@@ -698,7 +681,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | |||
698 | goto error_close; | 681 | goto error_close; |
699 | bh = btrfs_read_dev_super(bdev); | 682 | bh = btrfs_read_dev_super(bdev); |
700 | if (!bh) { | 683 | if (!bh) { |
701 | ret = -EIO; | 684 | ret = -EINVAL; |
702 | goto error_close; | 685 | goto error_close; |
703 | } | 686 | } |
704 | disk_super = (struct btrfs_super_block *)bh->b_data; | 687 | disk_super = (struct btrfs_super_block *)bh->b_data; |
@@ -706,77 +689,178 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | |||
706 | transid = btrfs_super_generation(disk_super); | 689 | transid = btrfs_super_generation(disk_super); |
707 | if (disk_super->label[0]) | 690 | if (disk_super->label[0]) |
708 | printk(KERN_INFO "device label %s ", disk_super->label); | 691 | printk(KERN_INFO "device label %s ", disk_super->label); |
709 | else { | 692 | else |
710 | /* FIXME, make a readl uuid parser */ | 693 | printk(KERN_INFO "device fsid %pU ", disk_super->fsid); |
711 | printk(KERN_INFO "device fsid %llx-%llx ", | ||
712 | *(unsigned long long *)disk_super->fsid, | ||
713 | *(unsigned long long *)(disk_super->fsid + 8)); | ||
714 | } | ||
715 | printk(KERN_CONT "devid %llu transid %llu %s\n", | 694 | printk(KERN_CONT "devid %llu transid %llu %s\n", |
716 | (unsigned long long)devid, (unsigned long long)transid, path); | 695 | (unsigned long long)devid, (unsigned long long)transid, path); |
717 | ret = device_list_add(path, disk_super, devid, fs_devices_ret); | 696 | ret = device_list_add(path, disk_super, devid, fs_devices_ret); |
718 | 697 | ||
719 | brelse(bh); | 698 | brelse(bh); |
720 | error_close: | 699 | error_close: |
721 | close_bdev_exclusive(bdev, flags); | 700 | blkdev_put(bdev, flags); |
722 | error: | 701 | error: |
723 | mutex_unlock(&uuid_mutex); | 702 | mutex_unlock(&uuid_mutex); |
724 | return ret; | 703 | return ret; |
725 | } | 704 | } |
726 | 705 | ||
706 | /* helper to account the used device space in the range */ | ||
707 | int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, | ||
708 | u64 end, u64 *length) | ||
709 | { | ||
710 | struct btrfs_key key; | ||
711 | struct btrfs_root *root = device->dev_root; | ||
712 | struct btrfs_dev_extent *dev_extent; | ||
713 | struct btrfs_path *path; | ||
714 | u64 extent_end; | ||
715 | int ret; | ||
716 | int slot; | ||
717 | struct extent_buffer *l; | ||
718 | |||
719 | *length = 0; | ||
720 | |||
721 | if (start >= device->total_bytes) | ||
722 | return 0; | ||
723 | |||
724 | path = btrfs_alloc_path(); | ||
725 | if (!path) | ||
726 | return -ENOMEM; | ||
727 | path->reada = 2; | ||
728 | |||
729 | key.objectid = device->devid; | ||
730 | key.offset = start; | ||
731 | key.type = BTRFS_DEV_EXTENT_KEY; | ||
732 | |||
733 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
734 | if (ret < 0) | ||
735 | goto out; | ||
736 | if (ret > 0) { | ||
737 | ret = btrfs_previous_item(root, path, key.objectid, key.type); | ||
738 | if (ret < 0) | ||
739 | goto out; | ||
740 | } | ||
741 | |||
742 | while (1) { | ||
743 | l = path->nodes[0]; | ||
744 | slot = path->slots[0]; | ||
745 | if (slot >= btrfs_header_nritems(l)) { | ||
746 | ret = btrfs_next_leaf(root, path); | ||
747 | if (ret == 0) | ||
748 | continue; | ||
749 | if (ret < 0) | ||
750 | goto out; | ||
751 | |||
752 | break; | ||
753 | } | ||
754 | btrfs_item_key_to_cpu(l, &key, slot); | ||
755 | |||
756 | if (key.objectid < device->devid) | ||
757 | goto next; | ||
758 | |||
759 | if (key.objectid > device->devid) | ||
760 | break; | ||
761 | |||
762 | if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) | ||
763 | goto next; | ||
764 | |||
765 | dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); | ||
766 | extent_end = key.offset + btrfs_dev_extent_length(l, | ||
767 | dev_extent); | ||
768 | if (key.offset <= start && extent_end > end) { | ||
769 | *length = end - start + 1; | ||
770 | break; | ||
771 | } else if (key.offset <= start && extent_end > start) | ||
772 | *length += extent_end - start; | ||
773 | else if (key.offset > start && extent_end <= end) | ||
774 | *length += extent_end - key.offset; | ||
775 | else if (key.offset > start && key.offset <= end) { | ||
776 | *length += end - key.offset + 1; | ||
777 | break; | ||
778 | } else if (key.offset > end) | ||
779 | break; | ||
780 | |||
781 | next: | ||
782 | path->slots[0]++; | ||
783 | } | ||
784 | ret = 0; | ||
785 | out: | ||
786 | btrfs_free_path(path); | ||
787 | return ret; | ||
788 | } | ||
789 | |||
727 | /* | 790 | /* |
791 | * find_free_dev_extent - find free space in the specified device | ||
792 | * @trans: transaction handler | ||
793 | * @device: the device which we search the free space in | ||
794 | * @num_bytes: the size of the free space that we need | ||
795 | * @start: store the start of the free space. | ||
796 | * @len: the size of the free space. that we find, or the size of the max | ||
797 | * free space if we don't find suitable free space | ||
798 | * | ||
728 | * this uses a pretty simple search, the expectation is that it is | 799 | * this uses a pretty simple search, the expectation is that it is |
729 | * called very infrequently and that a given device has a small number | 800 | * called very infrequently and that a given device has a small number |
730 | * of extents | 801 | * of extents |
802 | * | ||
803 | * @start is used to store the start of the free space if we find. But if we | ||
804 | * don't find suitable free space, it will be used to store the start position | ||
805 | * of the max free space. | ||
806 | * | ||
807 | * @len is used to store the size of the free space that we find. | ||
808 | * But if we don't find suitable free space, it is used to store the size of | ||
809 | * the max free space. | ||
731 | */ | 810 | */ |
732 | int find_free_dev_extent(struct btrfs_trans_handle *trans, | 811 | int find_free_dev_extent(struct btrfs_trans_handle *trans, |
733 | struct btrfs_device *device, u64 num_bytes, | 812 | struct btrfs_device *device, u64 num_bytes, |
734 | u64 *start, u64 *max_avail) | 813 | u64 *start, u64 *len) |
735 | { | 814 | { |
736 | struct btrfs_key key; | 815 | struct btrfs_key key; |
737 | struct btrfs_root *root = device->dev_root; | 816 | struct btrfs_root *root = device->dev_root; |
738 | struct btrfs_dev_extent *dev_extent = NULL; | 817 | struct btrfs_dev_extent *dev_extent; |
739 | struct btrfs_path *path; | 818 | struct btrfs_path *path; |
740 | u64 hole_size = 0; | 819 | u64 hole_size; |
741 | u64 last_byte = 0; | 820 | u64 max_hole_start; |
742 | u64 search_start = 0; | 821 | u64 max_hole_size; |
822 | u64 extent_end; | ||
823 | u64 search_start; | ||
743 | u64 search_end = device->total_bytes; | 824 | u64 search_end = device->total_bytes; |
744 | int ret; | 825 | int ret; |
745 | int slot = 0; | 826 | int slot; |
746 | int start_found; | ||
747 | struct extent_buffer *l; | 827 | struct extent_buffer *l; |
748 | 828 | ||
749 | path = btrfs_alloc_path(); | ||
750 | if (!path) | ||
751 | return -ENOMEM; | ||
752 | path->reada = 2; | ||
753 | start_found = 0; | ||
754 | |||
755 | /* FIXME use last free of some kind */ | 829 | /* FIXME use last free of some kind */ |
756 | 830 | ||
757 | /* we don't want to overwrite the superblock on the drive, | 831 | /* we don't want to overwrite the superblock on the drive, |
758 | * so we make sure to start at an offset of at least 1MB | 832 | * so we make sure to start at an offset of at least 1MB |
759 | */ | 833 | */ |
760 | search_start = max((u64)1024 * 1024, search_start); | 834 | search_start = max(root->fs_info->alloc_start, 1024ull * 1024); |
835 | |||
836 | max_hole_start = search_start; | ||
837 | max_hole_size = 0; | ||
761 | 838 | ||
762 | if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) | 839 | if (search_start >= search_end) { |
763 | search_start = max(root->fs_info->alloc_start, search_start); | 840 | ret = -ENOSPC; |
841 | goto error; | ||
842 | } | ||
843 | |||
844 | path = btrfs_alloc_path(); | ||
845 | if (!path) { | ||
846 | ret = -ENOMEM; | ||
847 | goto error; | ||
848 | } | ||
849 | path->reada = 2; | ||
764 | 850 | ||
765 | key.objectid = device->devid; | 851 | key.objectid = device->devid; |
766 | key.offset = search_start; | 852 | key.offset = search_start; |
767 | key.type = BTRFS_DEV_EXTENT_KEY; | 853 | key.type = BTRFS_DEV_EXTENT_KEY; |
854 | |||
768 | ret = btrfs_search_slot(trans, root, &key, path, 0, 0); | 855 | ret = btrfs_search_slot(trans, root, &key, path, 0, 0); |
769 | if (ret < 0) | 856 | if (ret < 0) |
770 | goto error; | 857 | goto out; |
771 | if (ret > 0) { | 858 | if (ret > 0) { |
772 | ret = btrfs_previous_item(root, path, key.objectid, key.type); | 859 | ret = btrfs_previous_item(root, path, key.objectid, key.type); |
773 | if (ret < 0) | 860 | if (ret < 0) |
774 | goto error; | 861 | goto out; |
775 | if (ret > 0) | ||
776 | start_found = 1; | ||
777 | } | 862 | } |
778 | l = path->nodes[0]; | 863 | |
779 | btrfs_item_key_to_cpu(l, &key, path->slots[0]); | ||
780 | while (1) { | 864 | while (1) { |
781 | l = path->nodes[0]; | 865 | l = path->nodes[0]; |
782 | slot = path->slots[0]; | 866 | slot = path->slots[0]; |
@@ -785,24 +869,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, | |||
785 | if (ret == 0) | 869 | if (ret == 0) |
786 | continue; | 870 | continue; |
787 | if (ret < 0) | 871 | if (ret < 0) |
788 | goto error; | 872 | goto out; |
789 | no_more_items: | 873 | |
790 | if (!start_found) { | 874 | break; |
791 | if (search_start >= search_end) { | ||
792 | ret = -ENOSPC; | ||
793 | goto error; | ||
794 | } | ||
795 | *start = search_start; | ||
796 | start_found = 1; | ||
797 | goto check_pending; | ||
798 | } | ||
799 | *start = last_byte > search_start ? | ||
800 | last_byte : search_start; | ||
801 | if (search_end <= *start) { | ||
802 | ret = -ENOSPC; | ||
803 | goto error; | ||
804 | } | ||
805 | goto check_pending; | ||
806 | } | 875 | } |
807 | btrfs_item_key_to_cpu(l, &key, slot); | 876 | btrfs_item_key_to_cpu(l, &key, slot); |
808 | 877 | ||
@@ -810,48 +879,62 @@ no_more_items: | |||
810 | goto next; | 879 | goto next; |
811 | 880 | ||
812 | if (key.objectid > device->devid) | 881 | if (key.objectid > device->devid) |
813 | goto no_more_items; | 882 | break; |
814 | 883 | ||
815 | if (key.offset >= search_start && key.offset > last_byte && | 884 | if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) |
816 | start_found) { | 885 | goto next; |
817 | if (last_byte < search_start) | ||
818 | last_byte = search_start; | ||
819 | hole_size = key.offset - last_byte; | ||
820 | 886 | ||
821 | if (hole_size > *max_avail) | 887 | if (key.offset > search_start) { |
822 | *max_avail = hole_size; | 888 | hole_size = key.offset - search_start; |
889 | |||
890 | if (hole_size > max_hole_size) { | ||
891 | max_hole_start = search_start; | ||
892 | max_hole_size = hole_size; | ||
893 | } | ||
823 | 894 | ||
824 | if (key.offset > last_byte && | 895 | /* |
825 | hole_size >= num_bytes) { | 896 | * If this free space is greater than which we need, |
826 | *start = last_byte; | 897 | * it must be the max free space that we have found |
827 | goto check_pending; | 898 | * until now, so max_hole_start must point to the start |
899 | * of this free space and the length of this free space | ||
900 | * is stored in max_hole_size. Thus, we return | ||
901 | * max_hole_start and max_hole_size and go back to the | ||
902 | * caller. | ||
903 | */ | ||
904 | if (hole_size >= num_bytes) { | ||
905 | ret = 0; | ||
906 | goto out; | ||
828 | } | 907 | } |
829 | } | 908 | } |
830 | if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) | ||
831 | goto next; | ||
832 | 909 | ||
833 | start_found = 1; | ||
834 | dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); | 910 | dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); |
835 | last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); | 911 | extent_end = key.offset + btrfs_dev_extent_length(l, |
912 | dev_extent); | ||
913 | if (extent_end > search_start) | ||
914 | search_start = extent_end; | ||
836 | next: | 915 | next: |
837 | path->slots[0]++; | 916 | path->slots[0]++; |
838 | cond_resched(); | 917 | cond_resched(); |
839 | } | 918 | } |
840 | check_pending: | ||
841 | /* we have to make sure we didn't find an extent that has already | ||
842 | * been allocated by the map tree or the original allocation | ||
843 | */ | ||
844 | BUG_ON(*start < search_start); | ||
845 | 919 | ||
846 | if (*start + num_bytes > search_end) { | 920 | hole_size = search_end- search_start; |
847 | ret = -ENOSPC; | 921 | if (hole_size > max_hole_size) { |
848 | goto error; | 922 | max_hole_start = search_start; |
923 | max_hole_size = hole_size; | ||
849 | } | 924 | } |
850 | /* check for pending inserts here */ | ||
851 | ret = 0; | ||
852 | 925 | ||
853 | error: | 926 | /* See above. */ |
927 | if (hole_size < num_bytes) | ||
928 | ret = -ENOSPC; | ||
929 | else | ||
930 | ret = 0; | ||
931 | |||
932 | out: | ||
854 | btrfs_free_path(path); | 933 | btrfs_free_path(path); |
934 | error: | ||
935 | *start = max_hole_start; | ||
936 | if (len) | ||
937 | *len = max_hole_size; | ||
855 | return ret; | 938 | return ret; |
856 | } | 939 | } |
857 | 940 | ||
@@ -879,14 +962,14 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, | |||
879 | if (ret > 0) { | 962 | if (ret > 0) { |
880 | ret = btrfs_previous_item(root, path, key.objectid, | 963 | ret = btrfs_previous_item(root, path, key.objectid, |
881 | BTRFS_DEV_EXTENT_KEY); | 964 | BTRFS_DEV_EXTENT_KEY); |
882 | BUG_ON(ret); | 965 | if (ret) |
966 | goto out; | ||
883 | leaf = path->nodes[0]; | 967 | leaf = path->nodes[0]; |
884 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | 968 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); |
885 | extent = btrfs_item_ptr(leaf, path->slots[0], | 969 | extent = btrfs_item_ptr(leaf, path->slots[0], |
886 | struct btrfs_dev_extent); | 970 | struct btrfs_dev_extent); |
887 | BUG_ON(found_key.offset > start || found_key.offset + | 971 | BUG_ON(found_key.offset > start || found_key.offset + |
888 | btrfs_dev_extent_length(leaf, extent) < start); | 972 | btrfs_dev_extent_length(leaf, extent) < start); |
889 | ret = 0; | ||
890 | } else if (ret == 0) { | 973 | } else if (ret == 0) { |
891 | leaf = path->nodes[0]; | 974 | leaf = path->nodes[0]; |
892 | extent = btrfs_item_ptr(leaf, path->slots[0], | 975 | extent = btrfs_item_ptr(leaf, path->slots[0], |
@@ -897,8 +980,8 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, | |||
897 | if (device->bytes_used > 0) | 980 | if (device->bytes_used > 0) |
898 | device->bytes_used -= btrfs_dev_extent_length(leaf, extent); | 981 | device->bytes_used -= btrfs_dev_extent_length(leaf, extent); |
899 | ret = btrfs_del_item(trans, root, path); | 982 | ret = btrfs_del_item(trans, root, path); |
900 | BUG_ON(ret); | ||
901 | 983 | ||
984 | out: | ||
902 | btrfs_free_path(path); | 985 | btrfs_free_path(path); |
903 | return ret; | 986 | return ret; |
904 | } | 987 | } |
@@ -1098,6 +1181,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, | |||
1098 | return -ENOMEM; | 1181 | return -ENOMEM; |
1099 | 1182 | ||
1100 | trans = btrfs_start_transaction(root, 0); | 1183 | trans = btrfs_start_transaction(root, 0); |
1184 | if (IS_ERR(trans)) { | ||
1185 | btrfs_free_path(path); | ||
1186 | return PTR_ERR(trans); | ||
1187 | } | ||
1101 | key.objectid = BTRFS_DEV_ITEMS_OBJECTID; | 1188 | key.objectid = BTRFS_DEV_ITEMS_OBJECTID; |
1102 | key.type = BTRFS_DEV_ITEM_KEY; | 1189 | key.type = BTRFS_DEV_ITEM_KEY; |
1103 | key.offset = device->devid; | 1190 | key.offset = device->devid; |
@@ -1129,11 +1216,13 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1129 | struct block_device *bdev; | 1216 | struct block_device *bdev; |
1130 | struct buffer_head *bh = NULL; | 1217 | struct buffer_head *bh = NULL; |
1131 | struct btrfs_super_block *disk_super; | 1218 | struct btrfs_super_block *disk_super; |
1219 | struct btrfs_fs_devices *cur_devices; | ||
1132 | u64 all_avail; | 1220 | u64 all_avail; |
1133 | u64 devid; | 1221 | u64 devid; |
1134 | u64 num_devices; | 1222 | u64 num_devices; |
1135 | u8 *dev_uuid; | 1223 | u8 *dev_uuid; |
1136 | int ret = 0; | 1224 | int ret = 0; |
1225 | bool clear_super = false; | ||
1137 | 1226 | ||
1138 | mutex_lock(&uuid_mutex); | 1227 | mutex_lock(&uuid_mutex); |
1139 | mutex_lock(&root->fs_info->volume_mutex); | 1228 | mutex_lock(&root->fs_info->volume_mutex); |
@@ -1164,14 +1253,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1164 | 1253 | ||
1165 | device = NULL; | 1254 | device = NULL; |
1166 | devices = &root->fs_info->fs_devices->devices; | 1255 | devices = &root->fs_info->fs_devices->devices; |
1167 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | 1256 | /* |
1257 | * It is safe to read the devices since the volume_mutex | ||
1258 | * is held. | ||
1259 | */ | ||
1168 | list_for_each_entry(tmp, devices, dev_list) { | 1260 | list_for_each_entry(tmp, devices, dev_list) { |
1169 | if (tmp->in_fs_metadata && !tmp->bdev) { | 1261 | if (tmp->in_fs_metadata && !tmp->bdev) { |
1170 | device = tmp; | 1262 | device = tmp; |
1171 | break; | 1263 | break; |
1172 | } | 1264 | } |
1173 | } | 1265 | } |
1174 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
1175 | bdev = NULL; | 1266 | bdev = NULL; |
1176 | bh = NULL; | 1267 | bh = NULL; |
1177 | disk_super = NULL; | 1268 | disk_super = NULL; |
@@ -1181,8 +1272,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1181 | goto out; | 1272 | goto out; |
1182 | } | 1273 | } |
1183 | } else { | 1274 | } else { |
1184 | bdev = open_bdev_exclusive(device_path, FMODE_READ, | 1275 | bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, |
1185 | root->fs_info->bdev_holder); | 1276 | root->fs_info->bdev_holder); |
1186 | if (IS_ERR(bdev)) { | 1277 | if (IS_ERR(bdev)) { |
1187 | ret = PTR_ERR(bdev); | 1278 | ret = PTR_ERR(bdev); |
1188 | goto out; | 1279 | goto out; |
@@ -1191,7 +1282,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1191 | set_blocksize(bdev, 4096); | 1282 | set_blocksize(bdev, 4096); |
1192 | bh = btrfs_read_dev_super(bdev); | 1283 | bh = btrfs_read_dev_super(bdev); |
1193 | if (!bh) { | 1284 | if (!bh) { |
1194 | ret = -EIO; | 1285 | ret = -EINVAL; |
1195 | goto error_close; | 1286 | goto error_close; |
1196 | } | 1287 | } |
1197 | disk_super = (struct btrfs_super_block *)bh->b_data; | 1288 | disk_super = (struct btrfs_super_block *)bh->b_data; |
@@ -1213,31 +1304,39 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1213 | } | 1304 | } |
1214 | 1305 | ||
1215 | if (device->writeable) { | 1306 | if (device->writeable) { |
1307 | lock_chunks(root); | ||
1216 | list_del_init(&device->dev_alloc_list); | 1308 | list_del_init(&device->dev_alloc_list); |
1309 | unlock_chunks(root); | ||
1217 | root->fs_info->fs_devices->rw_devices--; | 1310 | root->fs_info->fs_devices->rw_devices--; |
1311 | clear_super = true; | ||
1218 | } | 1312 | } |
1219 | 1313 | ||
1220 | ret = btrfs_shrink_device(device, 0); | 1314 | ret = btrfs_shrink_device(device, 0); |
1221 | if (ret) | 1315 | if (ret) |
1222 | goto error_brelse; | 1316 | goto error_undo; |
1223 | 1317 | ||
1224 | ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); | 1318 | ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); |
1225 | if (ret) | 1319 | if (ret) |
1226 | goto error_brelse; | 1320 | goto error_undo; |
1227 | 1321 | ||
1228 | device->in_fs_metadata = 0; | 1322 | device->in_fs_metadata = 0; |
1323 | btrfs_scrub_cancel_dev(root, device); | ||
1229 | 1324 | ||
1230 | /* | 1325 | /* |
1231 | * the device list mutex makes sure that we don't change | 1326 | * the device list mutex makes sure that we don't change |
1232 | * the device list while someone else is writing out all | 1327 | * the device list while someone else is writing out all |
1233 | * the device supers. | 1328 | * the device supers. |
1234 | */ | 1329 | */ |
1330 | |||
1331 | cur_devices = device->fs_devices; | ||
1235 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | 1332 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); |
1236 | list_del_init(&device->dev_list); | 1333 | list_del_rcu(&device->dev_list); |
1237 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
1238 | 1334 | ||
1239 | device->fs_devices->num_devices--; | 1335 | device->fs_devices->num_devices--; |
1240 | 1336 | ||
1337 | if (device->missing) | ||
1338 | root->fs_info->fs_devices->missing_devices--; | ||
1339 | |||
1241 | next_device = list_entry(root->fs_info->fs_devices->devices.next, | 1340 | next_device = list_entry(root->fs_info->fs_devices->devices.next, |
1242 | struct btrfs_device, dev_list); | 1341 | struct btrfs_device, dev_list); |
1243 | if (device->bdev == root->fs_info->sb->s_bdev) | 1342 | if (device->bdev == root->fs_info->sb->s_bdev) |
@@ -1245,34 +1344,36 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1245 | if (device->bdev == root->fs_info->fs_devices->latest_bdev) | 1344 | if (device->bdev == root->fs_info->fs_devices->latest_bdev) |
1246 | root->fs_info->fs_devices->latest_bdev = next_device->bdev; | 1345 | root->fs_info->fs_devices->latest_bdev = next_device->bdev; |
1247 | 1346 | ||
1248 | if (device->bdev) { | 1347 | if (device->bdev) |
1249 | close_bdev_exclusive(device->bdev, device->mode); | ||
1250 | device->bdev = NULL; | ||
1251 | device->fs_devices->open_devices--; | 1348 | device->fs_devices->open_devices--; |
1252 | } | 1349 | |
1350 | call_rcu(&device->rcu, free_device); | ||
1351 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
1253 | 1352 | ||
1254 | num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; | 1353 | num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; |
1255 | btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); | 1354 | btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); |
1256 | 1355 | ||
1257 | if (device->fs_devices->open_devices == 0) { | 1356 | if (cur_devices->open_devices == 0) { |
1258 | struct btrfs_fs_devices *fs_devices; | 1357 | struct btrfs_fs_devices *fs_devices; |
1259 | fs_devices = root->fs_info->fs_devices; | 1358 | fs_devices = root->fs_info->fs_devices; |
1260 | while (fs_devices) { | 1359 | while (fs_devices) { |
1261 | if (fs_devices->seed == device->fs_devices) | 1360 | if (fs_devices->seed == cur_devices) |
1262 | break; | 1361 | break; |
1263 | fs_devices = fs_devices->seed; | 1362 | fs_devices = fs_devices->seed; |
1264 | } | 1363 | } |
1265 | fs_devices->seed = device->fs_devices->seed; | 1364 | fs_devices->seed = cur_devices->seed; |
1266 | device->fs_devices->seed = NULL; | 1365 | cur_devices->seed = NULL; |
1267 | __btrfs_close_devices(device->fs_devices); | 1366 | lock_chunks(root); |
1268 | free_fs_devices(device->fs_devices); | 1367 | __btrfs_close_devices(cur_devices); |
1368 | unlock_chunks(root); | ||
1369 | free_fs_devices(cur_devices); | ||
1269 | } | 1370 | } |
1270 | 1371 | ||
1271 | /* | 1372 | /* |
1272 | * at this point, the device is zero sized. We want to | 1373 | * at this point, the device is zero sized. We want to |
1273 | * remove it from the devices list and zero out the old super | 1374 | * remove it from the devices list and zero out the old super |
1274 | */ | 1375 | */ |
1275 | if (device->writeable) { | 1376 | if (clear_super) { |
1276 | /* make sure this device isn't detected as part of | 1377 | /* make sure this device isn't detected as part of |
1277 | * the FS anymore | 1378 | * the FS anymore |
1278 | */ | 1379 | */ |
@@ -1281,19 +1382,26 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1281 | sync_dirty_buffer(bh); | 1382 | sync_dirty_buffer(bh); |
1282 | } | 1383 | } |
1283 | 1384 | ||
1284 | kfree(device->name); | ||
1285 | kfree(device); | ||
1286 | ret = 0; | 1385 | ret = 0; |
1287 | 1386 | ||
1288 | error_brelse: | 1387 | error_brelse: |
1289 | brelse(bh); | 1388 | brelse(bh); |
1290 | error_close: | 1389 | error_close: |
1291 | if (bdev) | 1390 | if (bdev) |
1292 | close_bdev_exclusive(bdev, FMODE_READ); | 1391 | blkdev_put(bdev, FMODE_READ | FMODE_EXCL); |
1293 | out: | 1392 | out: |
1294 | mutex_unlock(&root->fs_info->volume_mutex); | 1393 | mutex_unlock(&root->fs_info->volume_mutex); |
1295 | mutex_unlock(&uuid_mutex); | 1394 | mutex_unlock(&uuid_mutex); |
1296 | return ret; | 1395 | return ret; |
1396 | error_undo: | ||
1397 | if (device->writeable) { | ||
1398 | lock_chunks(root); | ||
1399 | list_add(&device->dev_alloc_list, | ||
1400 | &root->fs_info->fs_devices->alloc_list); | ||
1401 | unlock_chunks(root); | ||
1402 | root->fs_info->fs_devices->rw_devices++; | ||
1403 | } | ||
1404 | goto error_brelse; | ||
1297 | } | 1405 | } |
1298 | 1406 | ||
1299 | /* | 1407 | /* |
@@ -1330,7 +1438,12 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, | |||
1330 | INIT_LIST_HEAD(&seed_devices->devices); | 1438 | INIT_LIST_HEAD(&seed_devices->devices); |
1331 | INIT_LIST_HEAD(&seed_devices->alloc_list); | 1439 | INIT_LIST_HEAD(&seed_devices->alloc_list); |
1332 | mutex_init(&seed_devices->device_list_mutex); | 1440 | mutex_init(&seed_devices->device_list_mutex); |
1333 | list_splice_init(&fs_devices->devices, &seed_devices->devices); | 1441 | |
1442 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | ||
1443 | list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, | ||
1444 | synchronize_rcu); | ||
1445 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
1446 | |||
1334 | list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); | 1447 | list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); |
1335 | list_for_each_entry(device, &seed_devices->devices, dev_list) { | 1448 | list_for_each_entry(device, &seed_devices->devices, dev_list) { |
1336 | device->fs_devices = seed_devices; | 1449 | device->fs_devices = seed_devices; |
@@ -1391,7 +1504,7 @@ next_slot: | |||
1391 | goto error; | 1504 | goto error; |
1392 | leaf = path->nodes[0]; | 1505 | leaf = path->nodes[0]; |
1393 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | 1506 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); |
1394 | btrfs_release_path(root, path); | 1507 | btrfs_release_path(path); |
1395 | continue; | 1508 | continue; |
1396 | } | 1509 | } |
1397 | 1510 | ||
@@ -1441,7 +1554,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1441 | if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) | 1554 | if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) |
1442 | return -EINVAL; | 1555 | return -EINVAL; |
1443 | 1556 | ||
1444 | bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); | 1557 | bdev = blkdev_get_by_path(device_path, FMODE_EXCL, |
1558 | root->fs_info->bdev_holder); | ||
1445 | if (IS_ERR(bdev)) | 1559 | if (IS_ERR(bdev)) |
1446 | return PTR_ERR(bdev); | 1560 | return PTR_ERR(bdev); |
1447 | 1561 | ||
@@ -1482,14 +1596,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1482 | 1596 | ||
1483 | ret = find_next_devid(root, &device->devid); | 1597 | ret = find_next_devid(root, &device->devid); |
1484 | if (ret) { | 1598 | if (ret) { |
1599 | kfree(device->name); | ||
1485 | kfree(device); | 1600 | kfree(device); |
1486 | goto error; | 1601 | goto error; |
1487 | } | 1602 | } |
1488 | 1603 | ||
1489 | trans = btrfs_start_transaction(root, 0); | 1604 | trans = btrfs_start_transaction(root, 0); |
1605 | if (IS_ERR(trans)) { | ||
1606 | kfree(device->name); | ||
1607 | kfree(device); | ||
1608 | ret = PTR_ERR(trans); | ||
1609 | goto error; | ||
1610 | } | ||
1611 | |||
1490 | lock_chunks(root); | 1612 | lock_chunks(root); |
1491 | 1613 | ||
1492 | device->barriers = 1; | ||
1493 | device->writeable = 1; | 1614 | device->writeable = 1; |
1494 | device->work.func = pending_bios_fn; | 1615 | device->work.func = pending_bios_fn; |
1495 | generate_random_uuid(device->uuid); | 1616 | generate_random_uuid(device->uuid); |
@@ -1503,7 +1624,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1503 | device->dev_root = root->fs_info->dev_root; | 1624 | device->dev_root = root->fs_info->dev_root; |
1504 | device->bdev = bdev; | 1625 | device->bdev = bdev; |
1505 | device->in_fs_metadata = 1; | 1626 | device->in_fs_metadata = 1; |
1506 | device->mode = 0; | 1627 | device->mode = FMODE_EXCL; |
1507 | set_blocksize(device->bdev, 4096); | 1628 | set_blocksize(device->bdev, 4096); |
1508 | 1629 | ||
1509 | if (seeding_dev) { | 1630 | if (seeding_dev) { |
@@ -1519,7 +1640,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1519 | * half setup | 1640 | * half setup |
1520 | */ | 1641 | */ |
1521 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | 1642 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); |
1522 | list_add(&device->dev_list, &root->fs_info->fs_devices->devices); | 1643 | list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); |
1523 | list_add(&device->dev_alloc_list, | 1644 | list_add(&device->dev_alloc_list, |
1524 | &root->fs_info->fs_devices->alloc_list); | 1645 | &root->fs_info->fs_devices->alloc_list); |
1525 | root->fs_info->fs_devices->num_devices++; | 1646 | root->fs_info->fs_devices->num_devices++; |
@@ -1568,7 +1689,7 @@ out: | |||
1568 | mutex_unlock(&root->fs_info->volume_mutex); | 1689 | mutex_unlock(&root->fs_info->volume_mutex); |
1569 | return ret; | 1690 | return ret; |
1570 | error: | 1691 | error: |
1571 | close_bdev_exclusive(bdev, 0); | 1692 | blkdev_put(bdev, FMODE_EXCL); |
1572 | if (seeding_dev) { | 1693 | if (seeding_dev) { |
1573 | mutex_unlock(&uuid_mutex); | 1694 | mutex_unlock(&uuid_mutex); |
1574 | up_write(&sb->s_umount); | 1695 | up_write(&sb->s_umount); |
@@ -1677,10 +1798,9 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, | |||
1677 | BUG_ON(ret); | 1798 | BUG_ON(ret); |
1678 | 1799 | ||
1679 | ret = btrfs_del_item(trans, root, path); | 1800 | ret = btrfs_del_item(trans, root, path); |
1680 | BUG_ON(ret); | ||
1681 | 1801 | ||
1682 | btrfs_free_path(path); | 1802 | btrfs_free_path(path); |
1683 | return 0; | 1803 | return ret; |
1684 | } | 1804 | } |
1685 | 1805 | ||
1686 | static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 | 1806 | static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 |
@@ -1755,7 +1875,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, | |||
1755 | return ret; | 1875 | return ret; |
1756 | 1876 | ||
1757 | trans = btrfs_start_transaction(root, 0); | 1877 | trans = btrfs_start_transaction(root, 0); |
1758 | BUG_ON(!trans); | 1878 | BUG_ON(IS_ERR(trans)); |
1759 | 1879 | ||
1760 | lock_chunks(root); | 1880 | lock_chunks(root); |
1761 | 1881 | ||
@@ -1786,6 +1906,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, | |||
1786 | 1906 | ||
1787 | BUG_ON(ret); | 1907 | BUG_ON(ret); |
1788 | 1908 | ||
1909 | trace_btrfs_chunk_free(root, map, chunk_offset, em->len); | ||
1910 | |||
1789 | if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { | 1911 | if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { |
1790 | ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); | 1912 | ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); |
1791 | BUG_ON(ret); | 1913 | BUG_ON(ret); |
@@ -1853,7 +1975,7 @@ again: | |||
1853 | chunk = btrfs_item_ptr(leaf, path->slots[0], | 1975 | chunk = btrfs_item_ptr(leaf, path->slots[0], |
1854 | struct btrfs_chunk); | 1976 | struct btrfs_chunk); |
1855 | chunk_type = btrfs_chunk_type(leaf, chunk); | 1977 | chunk_type = btrfs_chunk_type(leaf, chunk); |
1856 | btrfs_release_path(chunk_root, path); | 1978 | btrfs_release_path(path); |
1857 | 1979 | ||
1858 | if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { | 1980 | if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { |
1859 | ret = btrfs_relocate_chunk(chunk_root, chunk_tree, | 1981 | ret = btrfs_relocate_chunk(chunk_root, chunk_tree, |
@@ -1901,7 +2023,6 @@ int btrfs_balance(struct btrfs_root *dev_root) | |||
1901 | u64 size_to_free; | 2023 | u64 size_to_free; |
1902 | struct btrfs_path *path; | 2024 | struct btrfs_path *path; |
1903 | struct btrfs_key key; | 2025 | struct btrfs_key key; |
1904 | struct btrfs_chunk *chunk; | ||
1905 | struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; | 2026 | struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; |
1906 | struct btrfs_trans_handle *trans; | 2027 | struct btrfs_trans_handle *trans; |
1907 | struct btrfs_key found_key; | 2028 | struct btrfs_key found_key; |
@@ -1909,6 +2030,9 @@ int btrfs_balance(struct btrfs_root *dev_root) | |||
1909 | if (dev_root->fs_info->sb->s_flags & MS_RDONLY) | 2030 | if (dev_root->fs_info->sb->s_flags & MS_RDONLY) |
1910 | return -EROFS; | 2031 | return -EROFS; |
1911 | 2032 | ||
2033 | if (!capable(CAP_SYS_ADMIN)) | ||
2034 | return -EPERM; | ||
2035 | |||
1912 | mutex_lock(&dev_root->fs_info->volume_mutex); | 2036 | mutex_lock(&dev_root->fs_info->volume_mutex); |
1913 | dev_root = dev_root->fs_info->dev_root; | 2037 | dev_root = dev_root->fs_info->dev_root; |
1914 | 2038 | ||
@@ -1927,7 +2051,7 @@ int btrfs_balance(struct btrfs_root *dev_root) | |||
1927 | BUG_ON(ret); | 2051 | BUG_ON(ret); |
1928 | 2052 | ||
1929 | trans = btrfs_start_transaction(dev_root, 0); | 2053 | trans = btrfs_start_transaction(dev_root, 0); |
1930 | BUG_ON(!trans); | 2054 | BUG_ON(IS_ERR(trans)); |
1931 | 2055 | ||
1932 | ret = btrfs_grow_device(trans, device, old_size); | 2056 | ret = btrfs_grow_device(trans, device, old_size); |
1933 | BUG_ON(ret); | 2057 | BUG_ON(ret); |
@@ -1965,19 +2089,17 @@ int btrfs_balance(struct btrfs_root *dev_root) | |||
1965 | if (found_key.objectid != key.objectid) | 2089 | if (found_key.objectid != key.objectid) |
1966 | break; | 2090 | break; |
1967 | 2091 | ||
1968 | chunk = btrfs_item_ptr(path->nodes[0], | ||
1969 | path->slots[0], | ||
1970 | struct btrfs_chunk); | ||
1971 | /* chunk zero is special */ | 2092 | /* chunk zero is special */ |
1972 | if (found_key.offset == 0) | 2093 | if (found_key.offset == 0) |
1973 | break; | 2094 | break; |
1974 | 2095 | ||
1975 | btrfs_release_path(chunk_root, path); | 2096 | btrfs_release_path(path); |
1976 | ret = btrfs_relocate_chunk(chunk_root, | 2097 | ret = btrfs_relocate_chunk(chunk_root, |
1977 | chunk_root->root_key.objectid, | 2098 | chunk_root->root_key.objectid, |
1978 | found_key.objectid, | 2099 | found_key.objectid, |
1979 | found_key.offset); | 2100 | found_key.offset); |
1980 | BUG_ON(ret && ret != -ENOSPC); | 2101 | if (ret && ret != -ENOSPC) |
2102 | goto error; | ||
1981 | key.offset = found_key.offset - 1; | 2103 | key.offset = found_key.offset - 1; |
1982 | } | 2104 | } |
1983 | ret = 0; | 2105 | ret = 0; |
@@ -2044,7 +2166,7 @@ again: | |||
2044 | goto done; | 2166 | goto done; |
2045 | if (ret) { | 2167 | if (ret) { |
2046 | ret = 0; | 2168 | ret = 0; |
2047 | btrfs_release_path(root, path); | 2169 | btrfs_release_path(path); |
2048 | break; | 2170 | break; |
2049 | } | 2171 | } |
2050 | 2172 | ||
@@ -2053,7 +2175,7 @@ again: | |||
2053 | btrfs_item_key_to_cpu(l, &key, path->slots[0]); | 2175 | btrfs_item_key_to_cpu(l, &key, path->slots[0]); |
2054 | 2176 | ||
2055 | if (key.objectid != device->devid) { | 2177 | if (key.objectid != device->devid) { |
2056 | btrfs_release_path(root, path); | 2178 | btrfs_release_path(path); |
2057 | break; | 2179 | break; |
2058 | } | 2180 | } |
2059 | 2181 | ||
@@ -2061,14 +2183,14 @@ again: | |||
2061 | length = btrfs_dev_extent_length(l, dev_extent); | 2183 | length = btrfs_dev_extent_length(l, dev_extent); |
2062 | 2184 | ||
2063 | if (key.offset + length <= new_size) { | 2185 | if (key.offset + length <= new_size) { |
2064 | btrfs_release_path(root, path); | 2186 | btrfs_release_path(path); |
2065 | break; | 2187 | break; |
2066 | } | 2188 | } |
2067 | 2189 | ||
2068 | chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); | 2190 | chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); |
2069 | chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); | 2191 | chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); |
2070 | chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); | 2192 | chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); |
2071 | btrfs_release_path(root, path); | 2193 | btrfs_release_path(path); |
2072 | 2194 | ||
2073 | ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, | 2195 | ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, |
2074 | chunk_offset); | 2196 | chunk_offset); |
@@ -2096,6 +2218,11 @@ again: | |||
2096 | 2218 | ||
2097 | /* Shrinking succeeded, else we would be at "done". */ | 2219 | /* Shrinking succeeded, else we would be at "done". */ |
2098 | trans = btrfs_start_transaction(root, 0); | 2220 | trans = btrfs_start_transaction(root, 0); |
2221 | if (IS_ERR(trans)) { | ||
2222 | ret = PTR_ERR(trans); | ||
2223 | goto done; | ||
2224 | } | ||
2225 | |||
2099 | lock_chunks(root); | 2226 | lock_chunks(root); |
2100 | 2227 | ||
2101 | device->disk_total_bytes = new_size; | 2228 | device->disk_total_bytes = new_size; |
@@ -2139,211 +2266,243 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, | |||
2139 | return 0; | 2266 | return 0; |
2140 | } | 2267 | } |
2141 | 2268 | ||
2142 | static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, | 2269 | /* |
2143 | int num_stripes, int sub_stripes) | 2270 | * sort the devices in descending order by max_avail, total_avail |
2271 | */ | ||
2272 | static int btrfs_cmp_device_info(const void *a, const void *b) | ||
2144 | { | 2273 | { |
2145 | if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) | 2274 | const struct btrfs_device_info *di_a = a; |
2146 | return calc_size; | 2275 | const struct btrfs_device_info *di_b = b; |
2147 | else if (type & BTRFS_BLOCK_GROUP_RAID10) | 2276 | |
2148 | return calc_size * (num_stripes / sub_stripes); | 2277 | if (di_a->max_avail > di_b->max_avail) |
2149 | else | 2278 | return -1; |
2150 | return calc_size * num_stripes; | 2279 | if (di_a->max_avail < di_b->max_avail) |
2280 | return 1; | ||
2281 | if (di_a->total_avail > di_b->total_avail) | ||
2282 | return -1; | ||
2283 | if (di_a->total_avail < di_b->total_avail) | ||
2284 | return 1; | ||
2285 | return 0; | ||
2151 | } | 2286 | } |
2152 | 2287 | ||
2153 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | 2288 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, |
2154 | struct btrfs_root *extent_root, | 2289 | struct btrfs_root *extent_root, |
2155 | struct map_lookup **map_ret, | 2290 | struct map_lookup **map_ret, |
2156 | u64 *num_bytes, u64 *stripe_size, | 2291 | u64 *num_bytes_out, u64 *stripe_size_out, |
2157 | u64 start, u64 type) | 2292 | u64 start, u64 type) |
2158 | { | 2293 | { |
2159 | struct btrfs_fs_info *info = extent_root->fs_info; | 2294 | struct btrfs_fs_info *info = extent_root->fs_info; |
2160 | struct btrfs_device *device = NULL; | ||
2161 | struct btrfs_fs_devices *fs_devices = info->fs_devices; | 2295 | struct btrfs_fs_devices *fs_devices = info->fs_devices; |
2162 | struct list_head *cur; | 2296 | struct list_head *cur; |
2163 | struct map_lookup *map = NULL; | 2297 | struct map_lookup *map = NULL; |
2164 | struct extent_map_tree *em_tree; | 2298 | struct extent_map_tree *em_tree; |
2165 | struct extent_map *em; | 2299 | struct extent_map *em; |
2166 | struct list_head private_devs; | 2300 | struct btrfs_device_info *devices_info = NULL; |
2167 | int min_stripe_size = 1 * 1024 * 1024; | 2301 | u64 total_avail; |
2168 | u64 calc_size = 1024 * 1024 * 1024; | 2302 | int num_stripes; /* total number of stripes to allocate */ |
2169 | u64 max_chunk_size = calc_size; | 2303 | int sub_stripes; /* sub_stripes info for map */ |
2170 | u64 min_free; | 2304 | int dev_stripes; /* stripes per dev */ |
2171 | u64 avail; | 2305 | int devs_max; /* max devs to use */ |
2172 | u64 max_avail = 0; | 2306 | int devs_min; /* min devs needed */ |
2173 | u64 dev_offset; | 2307 | int devs_increment; /* ndevs has to be a multiple of this */ |
2174 | int num_stripes = 1; | 2308 | int ncopies; /* how many copies to data has */ |
2175 | int min_stripes = 1; | ||
2176 | int sub_stripes = 0; | ||
2177 | int looped = 0; | ||
2178 | int ret; | 2309 | int ret; |
2179 | int index; | 2310 | u64 max_stripe_size; |
2180 | int stripe_len = 64 * 1024; | 2311 | u64 max_chunk_size; |
2312 | u64 stripe_size; | ||
2313 | u64 num_bytes; | ||
2314 | int ndevs; | ||
2315 | int i; | ||
2316 | int j; | ||
2181 | 2317 | ||
2182 | if ((type & BTRFS_BLOCK_GROUP_RAID1) && | 2318 | if ((type & BTRFS_BLOCK_GROUP_RAID1) && |
2183 | (type & BTRFS_BLOCK_GROUP_DUP)) { | 2319 | (type & BTRFS_BLOCK_GROUP_DUP)) { |
2184 | WARN_ON(1); | 2320 | WARN_ON(1); |
2185 | type &= ~BTRFS_BLOCK_GROUP_DUP; | 2321 | type &= ~BTRFS_BLOCK_GROUP_DUP; |
2186 | } | 2322 | } |
2323 | |||
2187 | if (list_empty(&fs_devices->alloc_list)) | 2324 | if (list_empty(&fs_devices->alloc_list)) |
2188 | return -ENOSPC; | 2325 | return -ENOSPC; |
2189 | 2326 | ||
2190 | if (type & (BTRFS_BLOCK_GROUP_RAID0)) { | 2327 | sub_stripes = 1; |
2191 | num_stripes = fs_devices->rw_devices; | 2328 | dev_stripes = 1; |
2192 | min_stripes = 2; | 2329 | devs_increment = 1; |
2193 | } | 2330 | ncopies = 1; |
2331 | devs_max = 0; /* 0 == as many as possible */ | ||
2332 | devs_min = 1; | ||
2333 | |||
2334 | /* | ||
2335 | * define the properties of each RAID type. | ||
2336 | * FIXME: move this to a global table and use it in all RAID | ||
2337 | * calculation code | ||
2338 | */ | ||
2194 | if (type & (BTRFS_BLOCK_GROUP_DUP)) { | 2339 | if (type & (BTRFS_BLOCK_GROUP_DUP)) { |
2195 | num_stripes = 2; | 2340 | dev_stripes = 2; |
2196 | min_stripes = 2; | 2341 | ncopies = 2; |
2197 | } | 2342 | devs_max = 1; |
2198 | if (type & (BTRFS_BLOCK_GROUP_RAID1)) { | 2343 | } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { |
2199 | if (fs_devices->rw_devices < 2) | 2344 | devs_min = 2; |
2200 | return -ENOSPC; | 2345 | } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { |
2201 | num_stripes = 2; | 2346 | devs_increment = 2; |
2202 | min_stripes = 2; | 2347 | ncopies = 2; |
2203 | } | 2348 | devs_max = 2; |
2204 | if (type & (BTRFS_BLOCK_GROUP_RAID10)) { | 2349 | devs_min = 2; |
2205 | num_stripes = fs_devices->rw_devices; | 2350 | } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { |
2206 | if (num_stripes < 4) | ||
2207 | return -ENOSPC; | ||
2208 | num_stripes &= ~(u32)1; | ||
2209 | sub_stripes = 2; | 2351 | sub_stripes = 2; |
2210 | min_stripes = 4; | 2352 | devs_increment = 2; |
2353 | ncopies = 2; | ||
2354 | devs_min = 4; | ||
2355 | } else { | ||
2356 | devs_max = 1; | ||
2211 | } | 2357 | } |
2212 | 2358 | ||
2213 | if (type & BTRFS_BLOCK_GROUP_DATA) { | 2359 | if (type & BTRFS_BLOCK_GROUP_DATA) { |
2214 | max_chunk_size = 10 * calc_size; | 2360 | max_stripe_size = 1024 * 1024 * 1024; |
2215 | min_stripe_size = 64 * 1024 * 1024; | 2361 | max_chunk_size = 10 * max_stripe_size; |
2216 | } else if (type & BTRFS_BLOCK_GROUP_METADATA) { | 2362 | } else if (type & BTRFS_BLOCK_GROUP_METADATA) { |
2217 | max_chunk_size = 256 * 1024 * 1024; | 2363 | max_stripe_size = 256 * 1024 * 1024; |
2218 | min_stripe_size = 32 * 1024 * 1024; | 2364 | max_chunk_size = max_stripe_size; |
2219 | } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { | 2365 | } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { |
2220 | calc_size = 8 * 1024 * 1024; | 2366 | max_stripe_size = 8 * 1024 * 1024; |
2221 | max_chunk_size = calc_size * 2; | 2367 | max_chunk_size = 2 * max_stripe_size; |
2222 | min_stripe_size = 1 * 1024 * 1024; | 2368 | } else { |
2369 | printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", | ||
2370 | type); | ||
2371 | BUG_ON(1); | ||
2223 | } | 2372 | } |
2224 | 2373 | ||
2225 | /* we don't want a chunk larger than 10% of writeable space */ | 2374 | /* we don't want a chunk larger than 10% of writeable space */ |
2226 | max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), | 2375 | max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), |
2227 | max_chunk_size); | 2376 | max_chunk_size); |
2228 | 2377 | ||
2229 | again: | 2378 | devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, |
2230 | max_avail = 0; | 2379 | GFP_NOFS); |
2231 | if (!map || map->num_stripes != num_stripes) { | 2380 | if (!devices_info) |
2232 | kfree(map); | 2381 | return -ENOMEM; |
2233 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); | ||
2234 | if (!map) | ||
2235 | return -ENOMEM; | ||
2236 | map->num_stripes = num_stripes; | ||
2237 | } | ||
2238 | |||
2239 | if (calc_size * num_stripes > max_chunk_size) { | ||
2240 | calc_size = max_chunk_size; | ||
2241 | do_div(calc_size, num_stripes); | ||
2242 | do_div(calc_size, stripe_len); | ||
2243 | calc_size *= stripe_len; | ||
2244 | } | ||
2245 | 2382 | ||
2246 | /* we don't want tiny stripes */ | 2383 | cur = fs_devices->alloc_list.next; |
2247 | if (!looped) | ||
2248 | calc_size = max_t(u64, min_stripe_size, calc_size); | ||
2249 | 2384 | ||
2250 | /* | 2385 | /* |
2251 | * we're about to do_div by the stripe_len so lets make sure | 2386 | * in the first pass through the devices list, we gather information |
2252 | * we end up with something bigger than a stripe | 2387 | * about the available holes on each device. |
2253 | */ | 2388 | */ |
2254 | calc_size = max_t(u64, calc_size, stripe_len * 4); | 2389 | ndevs = 0; |
2390 | while (cur != &fs_devices->alloc_list) { | ||
2391 | struct btrfs_device *device; | ||
2392 | u64 max_avail; | ||
2393 | u64 dev_offset; | ||
2255 | 2394 | ||
2256 | do_div(calc_size, stripe_len); | 2395 | device = list_entry(cur, struct btrfs_device, dev_alloc_list); |
2257 | calc_size *= stripe_len; | ||
2258 | 2396 | ||
2259 | cur = fs_devices->alloc_list.next; | 2397 | cur = cur->next; |
2260 | index = 0; | ||
2261 | 2398 | ||
2262 | if (type & BTRFS_BLOCK_GROUP_DUP) | 2399 | if (!device->writeable) { |
2263 | min_free = calc_size * 2; | 2400 | printk(KERN_ERR |
2264 | else | 2401 | "btrfs: read-only device in alloc_list\n"); |
2265 | min_free = calc_size; | 2402 | WARN_ON(1); |
2403 | continue; | ||
2404 | } | ||
2266 | 2405 | ||
2267 | /* | 2406 | if (!device->in_fs_metadata) |
2268 | * we add 1MB because we never use the first 1MB of the device, unless | 2407 | continue; |
2269 | * we've looped, then we are likely allocating the maximum amount of | ||
2270 | * space left already | ||
2271 | */ | ||
2272 | if (!looped) | ||
2273 | min_free += 1024 * 1024; | ||
2274 | 2408 | ||
2275 | INIT_LIST_HEAD(&private_devs); | ||
2276 | while (index < num_stripes) { | ||
2277 | device = list_entry(cur, struct btrfs_device, dev_alloc_list); | ||
2278 | BUG_ON(!device->writeable); | ||
2279 | if (device->total_bytes > device->bytes_used) | 2409 | if (device->total_bytes > device->bytes_used) |
2280 | avail = device->total_bytes - device->bytes_used; | 2410 | total_avail = device->total_bytes - device->bytes_used; |
2281 | else | 2411 | else |
2282 | avail = 0; | 2412 | total_avail = 0; |
2283 | cur = cur->next; | 2413 | /* avail is off by max(alloc_start, 1MB), but that is the same |
2414 | * for all devices, so it doesn't hurt the sorting later on | ||
2415 | */ | ||
2284 | 2416 | ||
2285 | if (device->in_fs_metadata && avail >= min_free) { | 2417 | ret = find_free_dev_extent(trans, device, |
2286 | ret = find_free_dev_extent(trans, device, | 2418 | max_stripe_size * dev_stripes, |
2287 | min_free, &dev_offset, | 2419 | &dev_offset, &max_avail); |
2288 | &max_avail); | 2420 | if (ret && ret != -ENOSPC) |
2289 | if (ret == 0) { | 2421 | goto error; |
2290 | list_move_tail(&device->dev_alloc_list, | 2422 | |
2291 | &private_devs); | 2423 | if (ret == 0) |
2292 | map->stripes[index].dev = device; | 2424 | max_avail = max_stripe_size * dev_stripes; |
2293 | map->stripes[index].physical = dev_offset; | 2425 | |
2294 | index++; | 2426 | if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) |
2295 | if (type & BTRFS_BLOCK_GROUP_DUP) { | 2427 | continue; |
2296 | map->stripes[index].dev = device; | 2428 | |
2297 | map->stripes[index].physical = | 2429 | devices_info[ndevs].dev_offset = dev_offset; |
2298 | dev_offset + calc_size; | 2430 | devices_info[ndevs].max_avail = max_avail; |
2299 | index++; | 2431 | devices_info[ndevs].total_avail = total_avail; |
2300 | } | 2432 | devices_info[ndevs].dev = device; |
2301 | } | 2433 | ++ndevs; |
2302 | } else if (device->in_fs_metadata && avail > max_avail) | ||
2303 | max_avail = avail; | ||
2304 | if (cur == &fs_devices->alloc_list) | ||
2305 | break; | ||
2306 | } | 2434 | } |
2307 | list_splice(&private_devs, &fs_devices->alloc_list); | 2435 | |
2308 | if (index < num_stripes) { | 2436 | /* |
2309 | if (index >= min_stripes) { | 2437 | * now sort the devices by hole size / available space |
2310 | num_stripes = index; | 2438 | */ |
2311 | if (type & (BTRFS_BLOCK_GROUP_RAID10)) { | 2439 | sort(devices_info, ndevs, sizeof(struct btrfs_device_info), |
2312 | num_stripes /= sub_stripes; | 2440 | btrfs_cmp_device_info, NULL); |
2313 | num_stripes *= sub_stripes; | 2441 | |
2314 | } | 2442 | /* round down to number of usable stripes */ |
2315 | looped = 1; | 2443 | ndevs -= ndevs % devs_increment; |
2316 | goto again; | 2444 | |
2317 | } | 2445 | if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { |
2318 | if (!looped && max_avail > 0) { | 2446 | ret = -ENOSPC; |
2319 | looped = 1; | 2447 | goto error; |
2320 | calc_size = max_avail; | 2448 | } |
2321 | goto again; | 2449 | |
2450 | if (devs_max && ndevs > devs_max) | ||
2451 | ndevs = devs_max; | ||
2452 | /* | ||
2453 | * the primary goal is to maximize the number of stripes, so use as many | ||
2454 | * devices as possible, even if the stripes are not maximum sized. | ||
2455 | */ | ||
2456 | stripe_size = devices_info[ndevs-1].max_avail; | ||
2457 | num_stripes = ndevs * dev_stripes; | ||
2458 | |||
2459 | if (stripe_size * num_stripes > max_chunk_size * ncopies) { | ||
2460 | stripe_size = max_chunk_size * ncopies; | ||
2461 | do_div(stripe_size, num_stripes); | ||
2462 | } | ||
2463 | |||
2464 | do_div(stripe_size, dev_stripes); | ||
2465 | do_div(stripe_size, BTRFS_STRIPE_LEN); | ||
2466 | stripe_size *= BTRFS_STRIPE_LEN; | ||
2467 | |||
2468 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); | ||
2469 | if (!map) { | ||
2470 | ret = -ENOMEM; | ||
2471 | goto error; | ||
2472 | } | ||
2473 | map->num_stripes = num_stripes; | ||
2474 | |||
2475 | for (i = 0; i < ndevs; ++i) { | ||
2476 | for (j = 0; j < dev_stripes; ++j) { | ||
2477 | int s = i * dev_stripes + j; | ||
2478 | map->stripes[s].dev = devices_info[i].dev; | ||
2479 | map->stripes[s].physical = devices_info[i].dev_offset + | ||
2480 | j * stripe_size; | ||
2322 | } | 2481 | } |
2323 | kfree(map); | ||
2324 | return -ENOSPC; | ||
2325 | } | 2482 | } |
2326 | map->sector_size = extent_root->sectorsize; | 2483 | map->sector_size = extent_root->sectorsize; |
2327 | map->stripe_len = stripe_len; | 2484 | map->stripe_len = BTRFS_STRIPE_LEN; |
2328 | map->io_align = stripe_len; | 2485 | map->io_align = BTRFS_STRIPE_LEN; |
2329 | map->io_width = stripe_len; | 2486 | map->io_width = BTRFS_STRIPE_LEN; |
2330 | map->type = type; | 2487 | map->type = type; |
2331 | map->num_stripes = num_stripes; | ||
2332 | map->sub_stripes = sub_stripes; | 2488 | map->sub_stripes = sub_stripes; |
2333 | 2489 | ||
2334 | *map_ret = map; | 2490 | *map_ret = map; |
2335 | *stripe_size = calc_size; | 2491 | num_bytes = stripe_size * (num_stripes / ncopies); |
2336 | *num_bytes = chunk_bytes_by_type(type, calc_size, | 2492 | |
2337 | num_stripes, sub_stripes); | 2493 | *stripe_size_out = stripe_size; |
2494 | *num_bytes_out = num_bytes; | ||
2338 | 2495 | ||
2339 | em = alloc_extent_map(GFP_NOFS); | 2496 | trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); |
2497 | |||
2498 | em = alloc_extent_map(); | ||
2340 | if (!em) { | 2499 | if (!em) { |
2341 | kfree(map); | 2500 | ret = -ENOMEM; |
2342 | return -ENOMEM; | 2501 | goto error; |
2343 | } | 2502 | } |
2344 | em->bdev = (struct block_device *)map; | 2503 | em->bdev = (struct block_device *)map; |
2345 | em->start = start; | 2504 | em->start = start; |
2346 | em->len = *num_bytes; | 2505 | em->len = num_bytes; |
2347 | em->block_start = 0; | 2506 | em->block_start = 0; |
2348 | em->block_len = em->len; | 2507 | em->block_len = em->len; |
2349 | 2508 | ||
@@ -2356,23 +2515,30 @@ again: | |||
2356 | 2515 | ||
2357 | ret = btrfs_make_block_group(trans, extent_root, 0, type, | 2516 | ret = btrfs_make_block_group(trans, extent_root, 0, type, |
2358 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, | 2517 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, |
2359 | start, *num_bytes); | 2518 | start, num_bytes); |
2360 | BUG_ON(ret); | 2519 | BUG_ON(ret); |
2361 | 2520 | ||
2362 | index = 0; | 2521 | for (i = 0; i < map->num_stripes; ++i) { |
2363 | while (index < map->num_stripes) { | 2522 | struct btrfs_device *device; |
2364 | device = map->stripes[index].dev; | 2523 | u64 dev_offset; |
2365 | dev_offset = map->stripes[index].physical; | 2524 | |
2525 | device = map->stripes[i].dev; | ||
2526 | dev_offset = map->stripes[i].physical; | ||
2366 | 2527 | ||
2367 | ret = btrfs_alloc_dev_extent(trans, device, | 2528 | ret = btrfs_alloc_dev_extent(trans, device, |
2368 | info->chunk_root->root_key.objectid, | 2529 | info->chunk_root->root_key.objectid, |
2369 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, | 2530 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, |
2370 | start, dev_offset, calc_size); | 2531 | start, dev_offset, stripe_size); |
2371 | BUG_ON(ret); | 2532 | BUG_ON(ret); |
2372 | index++; | ||
2373 | } | 2533 | } |
2374 | 2534 | ||
2535 | kfree(devices_info); | ||
2375 | return 0; | 2536 | return 0; |
2537 | |||
2538 | error: | ||
2539 | kfree(map); | ||
2540 | kfree(devices_info); | ||
2541 | return ret; | ||
2376 | } | 2542 | } |
2377 | 2543 | ||
2378 | static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, | 2544 | static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, |
@@ -2438,6 +2604,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, | |||
2438 | item_size); | 2604 | item_size); |
2439 | BUG_ON(ret); | 2605 | BUG_ON(ret); |
2440 | } | 2606 | } |
2607 | |||
2441 | kfree(chunk); | 2608 | kfree(chunk); |
2442 | return 0; | 2609 | return 0; |
2443 | } | 2610 | } |
@@ -2569,7 +2736,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) | |||
2569 | 2736 | ||
2570 | void btrfs_mapping_init(struct btrfs_mapping_tree *tree) | 2737 | void btrfs_mapping_init(struct btrfs_mapping_tree *tree) |
2571 | { | 2738 | { |
2572 | extent_map_tree_init(&tree->map_tree, GFP_NOFS); | 2739 | extent_map_tree_init(&tree->map_tree); |
2573 | } | 2740 | } |
2574 | 2741 | ||
2575 | void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) | 2742 | void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) |
@@ -2635,14 +2802,17 @@ static int find_live_mirror(struct map_lookup *map, int first, int num, | |||
2635 | static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | 2802 | static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, |
2636 | u64 logical, u64 *length, | 2803 | u64 logical, u64 *length, |
2637 | struct btrfs_multi_bio **multi_ret, | 2804 | struct btrfs_multi_bio **multi_ret, |
2638 | int mirror_num, struct page *unplug_page) | 2805 | int mirror_num) |
2639 | { | 2806 | { |
2640 | struct extent_map *em; | 2807 | struct extent_map *em; |
2641 | struct map_lookup *map; | 2808 | struct map_lookup *map; |
2642 | struct extent_map_tree *em_tree = &map_tree->map_tree; | 2809 | struct extent_map_tree *em_tree = &map_tree->map_tree; |
2643 | u64 offset; | 2810 | u64 offset; |
2644 | u64 stripe_offset; | 2811 | u64 stripe_offset; |
2812 | u64 stripe_end_offset; | ||
2645 | u64 stripe_nr; | 2813 | u64 stripe_nr; |
2814 | u64 stripe_nr_orig; | ||
2815 | u64 stripe_nr_end; | ||
2646 | int stripes_allocated = 8; | 2816 | int stripes_allocated = 8; |
2647 | int stripes_required = 1; | 2817 | int stripes_required = 1; |
2648 | int stripe_index; | 2818 | int stripe_index; |
@@ -2651,7 +2821,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | |||
2651 | int max_errors = 0; | 2821 | int max_errors = 0; |
2652 | struct btrfs_multi_bio *multi = NULL; | 2822 | struct btrfs_multi_bio *multi = NULL; |
2653 | 2823 | ||
2654 | if (multi_ret && !(rw & REQ_WRITE)) | 2824 | if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) |
2655 | stripes_allocated = 1; | 2825 | stripes_allocated = 1; |
2656 | again: | 2826 | again: |
2657 | if (multi_ret) { | 2827 | if (multi_ret) { |
@@ -2667,11 +2837,6 @@ again: | |||
2667 | em = lookup_extent_mapping(em_tree, logical, *length); | 2837 | em = lookup_extent_mapping(em_tree, logical, *length); |
2668 | read_unlock(&em_tree->lock); | 2838 | read_unlock(&em_tree->lock); |
2669 | 2839 | ||
2670 | if (!em && unplug_page) { | ||
2671 | kfree(multi); | ||
2672 | return 0; | ||
2673 | } | ||
2674 | |||
2675 | if (!em) { | 2840 | if (!em) { |
2676 | printk(KERN_CRIT "unable to find logical %llu len %llu\n", | 2841 | printk(KERN_CRIT "unable to find logical %llu len %llu\n", |
2677 | (unsigned long long)logical, | 2842 | (unsigned long long)logical, |
@@ -2697,7 +2862,15 @@ again: | |||
2697 | max_errors = 1; | 2862 | max_errors = 1; |
2698 | } | 2863 | } |
2699 | } | 2864 | } |
2700 | if (multi_ret && (rw & REQ_WRITE) && | 2865 | if (rw & REQ_DISCARD) { |
2866 | if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | | ||
2867 | BTRFS_BLOCK_GROUP_RAID1 | | ||
2868 | BTRFS_BLOCK_GROUP_DUP | | ||
2869 | BTRFS_BLOCK_GROUP_RAID10)) { | ||
2870 | stripes_required = map->num_stripes; | ||
2871 | } | ||
2872 | } | ||
2873 | if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && | ||
2701 | stripes_allocated < stripes_required) { | 2874 | stripes_allocated < stripes_required) { |
2702 | stripes_allocated = map->num_stripes; | 2875 | stripes_allocated = map->num_stripes; |
2703 | free_extent_map(em); | 2876 | free_extent_map(em); |
@@ -2717,23 +2890,37 @@ again: | |||
2717 | /* stripe_offset is the offset of this block in its stripe*/ | 2890 | /* stripe_offset is the offset of this block in its stripe*/ |
2718 | stripe_offset = offset - stripe_offset; | 2891 | stripe_offset = offset - stripe_offset; |
2719 | 2892 | ||
2720 | if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | | 2893 | if (rw & REQ_DISCARD) |
2721 | BTRFS_BLOCK_GROUP_RAID10 | | 2894 | *length = min_t(u64, em->len - offset, *length); |
2722 | BTRFS_BLOCK_GROUP_DUP)) { | 2895 | else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | |
2896 | BTRFS_BLOCK_GROUP_RAID1 | | ||
2897 | BTRFS_BLOCK_GROUP_RAID10 | | ||
2898 | BTRFS_BLOCK_GROUP_DUP)) { | ||
2723 | /* we limit the length of each bio to what fits in a stripe */ | 2899 | /* we limit the length of each bio to what fits in a stripe */ |
2724 | *length = min_t(u64, em->len - offset, | 2900 | *length = min_t(u64, em->len - offset, |
2725 | map->stripe_len - stripe_offset); | 2901 | map->stripe_len - stripe_offset); |
2726 | } else { | 2902 | } else { |
2727 | *length = em->len - offset; | 2903 | *length = em->len - offset; |
2728 | } | 2904 | } |
2729 | 2905 | ||
2730 | if (!multi_ret && !unplug_page) | 2906 | if (!multi_ret) |
2731 | goto out; | 2907 | goto out; |
2732 | 2908 | ||
2733 | num_stripes = 1; | 2909 | num_stripes = 1; |
2734 | stripe_index = 0; | 2910 | stripe_index = 0; |
2735 | if (map->type & BTRFS_BLOCK_GROUP_RAID1) { | 2911 | stripe_nr_orig = stripe_nr; |
2736 | if (unplug_page || (rw & REQ_WRITE)) | 2912 | stripe_nr_end = (offset + *length + map->stripe_len - 1) & |
2913 | (~(map->stripe_len - 1)); | ||
2914 | do_div(stripe_nr_end, map->stripe_len); | ||
2915 | stripe_end_offset = stripe_nr_end * map->stripe_len - | ||
2916 | (offset + *length); | ||
2917 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | ||
2918 | if (rw & REQ_DISCARD) | ||
2919 | num_stripes = min_t(u64, map->num_stripes, | ||
2920 | stripe_nr_end - stripe_nr_orig); | ||
2921 | stripe_index = do_div(stripe_nr, map->num_stripes); | ||
2922 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { | ||
2923 | if (rw & (REQ_WRITE | REQ_DISCARD)) | ||
2737 | num_stripes = map->num_stripes; | 2924 | num_stripes = map->num_stripes; |
2738 | else if (mirror_num) | 2925 | else if (mirror_num) |
2739 | stripe_index = mirror_num - 1; | 2926 | stripe_index = mirror_num - 1; |
@@ -2744,7 +2931,7 @@ again: | |||
2744 | } | 2931 | } |
2745 | 2932 | ||
2746 | } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { | 2933 | } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { |
2747 | if (rw & REQ_WRITE) | 2934 | if (rw & (REQ_WRITE | REQ_DISCARD)) |
2748 | num_stripes = map->num_stripes; | 2935 | num_stripes = map->num_stripes; |
2749 | else if (mirror_num) | 2936 | else if (mirror_num) |
2750 | stripe_index = mirror_num - 1; | 2937 | stripe_index = mirror_num - 1; |
@@ -2755,8 +2942,12 @@ again: | |||
2755 | stripe_index = do_div(stripe_nr, factor); | 2942 | stripe_index = do_div(stripe_nr, factor); |
2756 | stripe_index *= map->sub_stripes; | 2943 | stripe_index *= map->sub_stripes; |
2757 | 2944 | ||
2758 | if (unplug_page || (rw & REQ_WRITE)) | 2945 | if (rw & REQ_WRITE) |
2759 | num_stripes = map->sub_stripes; | 2946 | num_stripes = map->sub_stripes; |
2947 | else if (rw & REQ_DISCARD) | ||
2948 | num_stripes = min_t(u64, map->sub_stripes * | ||
2949 | (stripe_nr_end - stripe_nr_orig), | ||
2950 | map->num_stripes); | ||
2760 | else if (mirror_num) | 2951 | else if (mirror_num) |
2761 | stripe_index += mirror_num - 1; | 2952 | stripe_index += mirror_num - 1; |
2762 | else { | 2953 | else { |
@@ -2774,24 +2965,101 @@ again: | |||
2774 | } | 2965 | } |
2775 | BUG_ON(stripe_index >= map->num_stripes); | 2966 | BUG_ON(stripe_index >= map->num_stripes); |
2776 | 2967 | ||
2777 | for (i = 0; i < num_stripes; i++) { | 2968 | if (rw & REQ_DISCARD) { |
2778 | if (unplug_page) { | 2969 | for (i = 0; i < num_stripes; i++) { |
2779 | struct btrfs_device *device; | ||
2780 | struct backing_dev_info *bdi; | ||
2781 | |||
2782 | device = map->stripes[stripe_index].dev; | ||
2783 | if (device->bdev) { | ||
2784 | bdi = blk_get_backing_dev_info(device->bdev); | ||
2785 | if (bdi->unplug_io_fn) | ||
2786 | bdi->unplug_io_fn(bdi, unplug_page); | ||
2787 | } | ||
2788 | } else { | ||
2789 | multi->stripes[i].physical = | 2970 | multi->stripes[i].physical = |
2790 | map->stripes[stripe_index].physical + | 2971 | map->stripes[stripe_index].physical + |
2791 | stripe_offset + stripe_nr * map->stripe_len; | 2972 | stripe_offset + stripe_nr * map->stripe_len; |
2792 | multi->stripes[i].dev = map->stripes[stripe_index].dev; | 2973 | multi->stripes[i].dev = map->stripes[stripe_index].dev; |
2974 | |||
2975 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | ||
2976 | u64 stripes; | ||
2977 | u32 last_stripe = 0; | ||
2978 | int j; | ||
2979 | |||
2980 | div_u64_rem(stripe_nr_end - 1, | ||
2981 | map->num_stripes, | ||
2982 | &last_stripe); | ||
2983 | |||
2984 | for (j = 0; j < map->num_stripes; j++) { | ||
2985 | u32 test; | ||
2986 | |||
2987 | div_u64_rem(stripe_nr_end - 1 - j, | ||
2988 | map->num_stripes, &test); | ||
2989 | if (test == stripe_index) | ||
2990 | break; | ||
2991 | } | ||
2992 | stripes = stripe_nr_end - 1 - j; | ||
2993 | do_div(stripes, map->num_stripes); | ||
2994 | multi->stripes[i].length = map->stripe_len * | ||
2995 | (stripes - stripe_nr + 1); | ||
2996 | |||
2997 | if (i == 0) { | ||
2998 | multi->stripes[i].length -= | ||
2999 | stripe_offset; | ||
3000 | stripe_offset = 0; | ||
3001 | } | ||
3002 | if (stripe_index == last_stripe) | ||
3003 | multi->stripes[i].length -= | ||
3004 | stripe_end_offset; | ||
3005 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { | ||
3006 | u64 stripes; | ||
3007 | int j; | ||
3008 | int factor = map->num_stripes / | ||
3009 | map->sub_stripes; | ||
3010 | u32 last_stripe = 0; | ||
3011 | |||
3012 | div_u64_rem(stripe_nr_end - 1, | ||
3013 | factor, &last_stripe); | ||
3014 | last_stripe *= map->sub_stripes; | ||
3015 | |||
3016 | for (j = 0; j < factor; j++) { | ||
3017 | u32 test; | ||
3018 | |||
3019 | div_u64_rem(stripe_nr_end - 1 - j, | ||
3020 | factor, &test); | ||
3021 | |||
3022 | if (test == | ||
3023 | stripe_index / map->sub_stripes) | ||
3024 | break; | ||
3025 | } | ||
3026 | stripes = stripe_nr_end - 1 - j; | ||
3027 | do_div(stripes, factor); | ||
3028 | multi->stripes[i].length = map->stripe_len * | ||
3029 | (stripes - stripe_nr + 1); | ||
3030 | |||
3031 | if (i < map->sub_stripes) { | ||
3032 | multi->stripes[i].length -= | ||
3033 | stripe_offset; | ||
3034 | if (i == map->sub_stripes - 1) | ||
3035 | stripe_offset = 0; | ||
3036 | } | ||
3037 | if (stripe_index >= last_stripe && | ||
3038 | stripe_index <= (last_stripe + | ||
3039 | map->sub_stripes - 1)) { | ||
3040 | multi->stripes[i].length -= | ||
3041 | stripe_end_offset; | ||
3042 | } | ||
3043 | } else | ||
3044 | multi->stripes[i].length = *length; | ||
3045 | |||
3046 | stripe_index++; | ||
3047 | if (stripe_index == map->num_stripes) { | ||
3048 | /* This could only happen for RAID0/10 */ | ||
3049 | stripe_index = 0; | ||
3050 | stripe_nr++; | ||
3051 | } | ||
3052 | } | ||
3053 | } else { | ||
3054 | for (i = 0; i < num_stripes; i++) { | ||
3055 | multi->stripes[i].physical = | ||
3056 | map->stripes[stripe_index].physical + | ||
3057 | stripe_offset + | ||
3058 | stripe_nr * map->stripe_len; | ||
3059 | multi->stripes[i].dev = | ||
3060 | map->stripes[stripe_index].dev; | ||
3061 | stripe_index++; | ||
2793 | } | 3062 | } |
2794 | stripe_index++; | ||
2795 | } | 3063 | } |
2796 | if (multi_ret) { | 3064 | if (multi_ret) { |
2797 | *multi_ret = multi; | 3065 | *multi_ret = multi; |
@@ -2808,7 +3076,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | |||
2808 | struct btrfs_multi_bio **multi_ret, int mirror_num) | 3076 | struct btrfs_multi_bio **multi_ret, int mirror_num) |
2809 | { | 3077 | { |
2810 | return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, | 3078 | return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, |
2811 | mirror_num, NULL); | 3079 | mirror_num); |
2812 | } | 3080 | } |
2813 | 3081 | ||
2814 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | 3082 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, |
@@ -2876,14 +3144,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
2876 | return 0; | 3144 | return 0; |
2877 | } | 3145 | } |
2878 | 3146 | ||
2879 | int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, | ||
2880 | u64 logical, struct page *page) | ||
2881 | { | ||
2882 | u64 length = PAGE_CACHE_SIZE; | ||
2883 | return __btrfs_map_block(map_tree, READ, logical, &length, | ||
2884 | NULL, 0, page); | ||
2885 | } | ||
2886 | |||
2887 | static void end_bio_multi_stripe(struct bio *bio, int err) | 3147 | static void end_bio_multi_stripe(struct bio *bio, int err) |
2888 | { | 3148 | { |
2889 | struct btrfs_multi_bio *multi = bio->bi_private; | 3149 | struct btrfs_multi_bio *multi = bio->bi_private; |
@@ -3034,8 +3294,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
3034 | } | 3294 | } |
3035 | bio->bi_sector = multi->stripes[dev_nr].physical >> 9; | 3295 | bio->bi_sector = multi->stripes[dev_nr].physical >> 9; |
3036 | dev = multi->stripes[dev_nr].dev; | 3296 | dev = multi->stripes[dev_nr].dev; |
3037 | BUG_ON(rw == WRITE && !dev->writeable); | 3297 | if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { |
3038 | if (dev && dev->bdev) { | ||
3039 | bio->bi_bdev = dev->bdev; | 3298 | bio->bi_bdev = dev->bdev; |
3040 | if (async_submit) | 3299 | if (async_submit) |
3041 | schedule_bio(root, dev, rw, bio); | 3300 | schedule_bio(root, dev, rw, bio); |
@@ -3084,12 +3343,13 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root, | |||
3084 | return NULL; | 3343 | return NULL; |
3085 | list_add(&device->dev_list, | 3344 | list_add(&device->dev_list, |
3086 | &fs_devices->devices); | 3345 | &fs_devices->devices); |
3087 | device->barriers = 1; | ||
3088 | device->dev_root = root->fs_info->dev_root; | 3346 | device->dev_root = root->fs_info->dev_root; |
3089 | device->devid = devid; | 3347 | device->devid = devid; |
3090 | device->work.func = pending_bios_fn; | 3348 | device->work.func = pending_bios_fn; |
3091 | device->fs_devices = fs_devices; | 3349 | device->fs_devices = fs_devices; |
3350 | device->missing = 1; | ||
3092 | fs_devices->num_devices++; | 3351 | fs_devices->num_devices++; |
3352 | fs_devices->missing_devices++; | ||
3093 | spin_lock_init(&device->io_lock); | 3353 | spin_lock_init(&device->io_lock); |
3094 | INIT_LIST_HEAD(&device->dev_alloc_list); | 3354 | INIT_LIST_HEAD(&device->dev_alloc_list); |
3095 | memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); | 3355 | memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); |
@@ -3126,7 +3386,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, | |||
3126 | free_extent_map(em); | 3386 | free_extent_map(em); |
3127 | } | 3387 | } |
3128 | 3388 | ||
3129 | em = alloc_extent_map(GFP_NOFS); | 3389 | em = alloc_extent_map(); |
3130 | if (!em) | 3390 | if (!em) |
3131 | return -ENOMEM; | 3391 | return -ENOMEM; |
3132 | num_stripes = btrfs_chunk_num_stripes(leaf, chunk); | 3392 | num_stripes = btrfs_chunk_num_stripes(leaf, chunk); |
@@ -3287,6 +3547,15 @@ static int read_one_dev(struct btrfs_root *root, | |||
3287 | device = add_missing_dev(root, devid, dev_uuid); | 3547 | device = add_missing_dev(root, devid, dev_uuid); |
3288 | if (!device) | 3548 | if (!device) |
3289 | return -ENOMEM; | 3549 | return -ENOMEM; |
3550 | } else if (!device->missing) { | ||
3551 | /* | ||
3552 | * this happens when a device that was properly setup | ||
3553 | * in the device info lists suddenly goes bad. | ||
3554 | * device->bdev is NULL, and so we have to set | ||
3555 | * device->missing to one here | ||
3556 | */ | ||
3557 | root->fs_info->fs_devices->missing_devices++; | ||
3558 | device->missing = 1; | ||
3290 | } | 3559 | } |
3291 | } | 3560 | } |
3292 | 3561 | ||
@@ -3306,15 +3575,6 @@ static int read_one_dev(struct btrfs_root *root, | |||
3306 | return ret; | 3575 | return ret; |
3307 | } | 3576 | } |
3308 | 3577 | ||
3309 | int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) | ||
3310 | { | ||
3311 | struct btrfs_dev_item *dev_item; | ||
3312 | |||
3313 | dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, | ||
3314 | dev_item); | ||
3315 | return read_one_dev(root, buf, dev_item); | ||
3316 | } | ||
3317 | |||
3318 | int btrfs_read_sys_array(struct btrfs_root *root) | 3578 | int btrfs_read_sys_array(struct btrfs_root *root) |
3319 | { | 3579 | { |
3320 | struct btrfs_super_block *super_copy = &root->fs_info->super_copy; | 3580 | struct btrfs_super_block *super_copy = &root->fs_info->super_copy; |
@@ -3431,7 +3691,7 @@ again: | |||
3431 | } | 3691 | } |
3432 | if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { | 3692 | if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { |
3433 | key.objectid = 0; | 3693 | key.objectid = 0; |
3434 | btrfs_release_path(root, path); | 3694 | btrfs_release_path(path); |
3435 | goto again; | 3695 | goto again; |
3436 | } | 3696 | } |
3437 | ret = 0; | 3697 | ret = 0; |