aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/disk-io.c
diff options
context:
space:
mode:
author Glenn Elliott <gelliott@cs.unc.edu> 2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu> 2012-03-04 19:47:13 -0500
commit c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /fs/btrfs/disk-io.c
parent ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent 6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp (branch: wip-k-fmlp)
Conflicts: litmus/sched_cedf.c
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r-- fs/btrfs/disk-io.c 958
1 files changed, 724 insertions, 234 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 64f10082f048..1ac8db5dc0a3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -28,6 +28,9 @@
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/migrate.h>
32#include <linux/ratelimit.h>
33#include <asm/unaligned.h>
31#include "compat.h" 34#include "compat.h"
32#include "ctree.h" 35#include "ctree.h"
33#include "disk-io.h" 36#include "disk-io.h"
@@ -39,10 +42,25 @@
39#include "locking.h" 42#include "locking.h"
40#include "tree-log.h" 43#include "tree-log.h"
41#include "free-space-cache.h" 44#include "free-space-cache.h"
45#include "inode-map.h"
42 46
43static struct extent_io_ops btree_extent_io_ops; 47static struct extent_io_ops btree_extent_io_ops;
44static void end_workqueue_fn(struct btrfs_work *work); 48static void end_workqueue_fn(struct btrfs_work *work);
45static void free_fs_root(struct btrfs_root *root); 49static void free_fs_root(struct btrfs_root *root);
50static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
51 int read_only);
52static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
53static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
54static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
55 struct btrfs_root *root);
56static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
57static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
58static int btrfs_destroy_marked_extents(struct btrfs_root *root,
59 struct extent_io_tree *dirty_pages,
60 int mark);
61static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
62 struct extent_io_tree *pinned_extents);
63static int btrfs_cleanup_transaction(struct btrfs_root *root);
46 64
47/* 65/*
48 * end_io_wq structs are used to do processing in task context when an IO is 66 * end_io_wq structs are used to do processing in task context when an IO is
@@ -121,7 +139,7 @@ static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
121 * that covers the entire device 139 * that covers the entire device
122 */ 140 */
123static struct extent_map *btree_get_extent(struct inode *inode, 141static struct extent_map *btree_get_extent(struct inode *inode,
124 struct page *page, size_t page_offset, u64 start, u64 len, 142 struct page *page, size_t pg_offset, u64 start, u64 len,
125 int create) 143 int create)
126{ 144{
127 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 145 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
@@ -138,7 +156,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
138 } 156 }
139 read_unlock(&em_tree->lock); 157 read_unlock(&em_tree->lock);
140 158
141 em = alloc_extent_map(GFP_NOFS); 159 em = alloc_extent_map();
142 if (!em) { 160 if (!em) {
143 em = ERR_PTR(-ENOMEM); 161 em = ERR_PTR(-ENOMEM);
144 goto out; 162 goto out;
@@ -183,7 +201,7 @@ u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
183 201
184void btrfs_csum_final(u32 crc, char *result) 202void btrfs_csum_final(u32 crc, char *result)
185{ 203{
186 *(__le32 *)result = ~cpu_to_le32(crc); 204 put_unaligned_le32(~crc, result);
187} 205}
188 206
189/* 207/*
@@ -238,14 +256,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
238 memcpy(&found, result, csum_size); 256 memcpy(&found, result, csum_size);
239 257
240 read_extent_buffer(buf, &val, 0, csum_size); 258 read_extent_buffer(buf, &val, 0, csum_size);
241 if (printk_ratelimit()) { 259 printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
242 printk(KERN_INFO "btrfs: %s checksum verify "
243 "failed on %llu wanted %X found %X " 260 "failed on %llu wanted %X found %X "
244 "level %d\n", 261 "level %d\n",
245 root->fs_info->sb->s_id, 262 root->fs_info->sb->s_id,
246 (unsigned long long)buf->start, val, found, 263 (unsigned long long)buf->start, val, found,
247 btrfs_header_level(buf)); 264 btrfs_header_level(buf));
248 }
249 if (result != (char *)&inline_result) 265 if (result != (char *)&inline_result)
250 kfree(result); 266 kfree(result);
251 return 1; 267 return 1;
@@ -280,13 +296,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
280 ret = 0; 296 ret = 0;
281 goto out; 297 goto out;
282 } 298 }
283 if (printk_ratelimit()) { 299 printk_ratelimited("parent transid verify failed on %llu wanted %llu "
284 printk("parent transid verify failed on %llu wanted %llu "
285 "found %llu\n", 300 "found %llu\n",
286 (unsigned long long)eb->start, 301 (unsigned long long)eb->start,
287 (unsigned long long)parent_transid, 302 (unsigned long long)parent_transid,
288 (unsigned long long)btrfs_header_generation(eb)); 303 (unsigned long long)btrfs_header_generation(eb));
289 }
290 ret = 1; 304 ret = 1;
291 clear_extent_buffer_uptodate(io_tree, eb, &cached_state); 305 clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
292out: 306out:
@@ -308,6 +322,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
308 int num_copies = 0; 322 int num_copies = 0;
309 int mirror_num = 0; 323 int mirror_num = 0;
310 324
325 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
311 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 326 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
312 while (1) { 327 while (1) {
313 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 328 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
@@ -316,6 +331,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
316 !verify_parent_transid(io_tree, eb, parent_transid)) 331 !verify_parent_transid(io_tree, eb, parent_transid))
317 return ret; 332 return ret;
318 333
334 /*
335 * This buffer's crc is fine, but its contents are corrupted, so
336 * there is no reason to read the other copies, they won't be
337 * any less wrong.
338 */
339 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
340 return ret;
341
319 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 342 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
320 eb->start, eb->len); 343 eb->start, eb->len);
321 if (num_copies == 1) 344 if (num_copies == 1)
@@ -338,24 +361,33 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
338 struct extent_io_tree *tree; 361 struct extent_io_tree *tree;
339 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 362 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
340 u64 found_start; 363 u64 found_start;
341 int found_level;
342 unsigned long len; 364 unsigned long len;
343 struct extent_buffer *eb; 365 struct extent_buffer *eb;
344 int ret; 366 int ret;
345 367
346 tree = &BTRFS_I(page->mapping->host)->io_tree; 368 tree = &BTRFS_I(page->mapping->host)->io_tree;
347 369
348 if (page->private == EXTENT_PAGE_PRIVATE) 370 if (page->private == EXTENT_PAGE_PRIVATE) {
371 WARN_ON(1);
349 goto out; 372 goto out;
350 if (!page->private) 373 }
374 if (!page->private) {
375 WARN_ON(1);
351 goto out; 376 goto out;
377 }
352 len = page->private >> 2; 378 len = page->private >> 2;
353 WARN_ON(len == 0); 379 WARN_ON(len == 0);
354 380
355 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); 381 eb = alloc_extent_buffer(tree, start, len, page);
382 if (eb == NULL) {
383 WARN_ON(1);
384 goto out;
385 }
356 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, 386 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
357 btrfs_header_generation(eb)); 387 btrfs_header_generation(eb));
358 BUG_ON(ret); 388 BUG_ON(ret);
389 WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
390
359 found_start = btrfs_header_bytenr(eb); 391 found_start = btrfs_header_bytenr(eb);
360 if (found_start != start) { 392 if (found_start != start) {
361 WARN_ON(1); 393 WARN_ON(1);
@@ -369,8 +401,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
369 WARN_ON(1); 401 WARN_ON(1);
370 goto err; 402 goto err;
371 } 403 }
372 found_level = btrfs_header_level(eb);
373
374 csum_tree_block(root, eb, 0); 404 csum_tree_block(root, eb, 0);
375err: 405err:
376 free_extent_buffer(eb); 406 free_extent_buffer(eb);
@@ -397,6 +427,73 @@ static int check_tree_block_fsid(struct btrfs_root *root,
397 return ret; 427 return ret;
398} 428}
399 429
430#define CORRUPT(reason, eb, root, slot) \
431 printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \
432 "root=%llu, slot=%d\n", reason, \
433 (unsigned long long)btrfs_header_bytenr(eb), \
434 (unsigned long long)root->objectid, slot)
435
436static noinline int check_leaf(struct btrfs_root *root,
437 struct extent_buffer *leaf)
438{
439 struct btrfs_key key;
440 struct btrfs_key leaf_key;
441 u32 nritems = btrfs_header_nritems(leaf);
442 int slot;
443
444 if (nritems == 0)
445 return 0;
446
447 /* Check the 0 item */
448 if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
449 BTRFS_LEAF_DATA_SIZE(root)) {
450 CORRUPT("invalid item offset size pair", leaf, root, 0);
451 return -EIO;
452 }
453
454 /*
455 * Check to make sure each item's keys are in the correct order and their
456 * offsets make sense. We only have to loop through nritems-1 because
457 * we check the current slot against the next slot, which verifies the
458 * next slot's offset+size makes sense and that the current slot's
459 * offset is correct.
460 */
461 for (slot = 0; slot < nritems - 1; slot++) {
462 btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
463 btrfs_item_key_to_cpu(leaf, &key, slot + 1);
464
465 /* Make sure the keys are in the right order */
466 if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
467 CORRUPT("bad key order", leaf, root, slot);
468 return -EIO;
469 }
470
471 /*
472 * Make sure the offset and ends are right, remember that the
473 * item data starts at the end of the leaf and grows towards the
474 * front.
475 */
476 if (btrfs_item_offset_nr(leaf, slot) !=
477 btrfs_item_end_nr(leaf, slot + 1)) {
478 CORRUPT("slot offset bad", leaf, root, slot);
479 return -EIO;
480 }
481
482 /*
483 * Check to make sure that we don't point outside of the leaf,
484 * just in case all the items are consistent with each other, but
485 * all point outside of the leaf.
486 */
487 if (btrfs_item_end_nr(leaf, slot) >
488 BTRFS_LEAF_DATA_SIZE(root)) {
489 CORRUPT("slot end outside of leaf", leaf, root, slot);
490 return -EIO;
491 }
492 }
493
494 return 0;
495}
496
400#ifdef CONFIG_DEBUG_LOCK_ALLOC 497#ifdef CONFIG_DEBUG_LOCK_ALLOC
401void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) 498void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
402{ 499{
@@ -426,16 +523,18 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
426 len = page->private >> 2; 523 len = page->private >> 2;
427 WARN_ON(len == 0); 524 WARN_ON(len == 0);
428 525
429 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); 526 eb = alloc_extent_buffer(tree, start, len, page);
527 if (eb == NULL) {
528 ret = -EIO;
529 goto out;
530 }
430 531
431 found_start = btrfs_header_bytenr(eb); 532 found_start = btrfs_header_bytenr(eb);
432 if (found_start != start) { 533 if (found_start != start) {
433 if (printk_ratelimit()) { 534 printk_ratelimited(KERN_INFO "btrfs bad tree block start "
434 printk(KERN_INFO "btrfs bad tree block start "
435 "%llu %llu\n", 535 "%llu %llu\n",
436 (unsigned long long)found_start, 536 (unsigned long long)found_start,
437 (unsigned long long)eb->start); 537 (unsigned long long)eb->start);
438 }
439 ret = -EIO; 538 ret = -EIO;
440 goto err; 539 goto err;
441 } 540 }
@@ -447,10 +546,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
447 goto err; 546 goto err;
448 } 547 }
449 if (check_tree_block_fsid(root, eb)) { 548 if (check_tree_block_fsid(root, eb)) {
450 if (printk_ratelimit()) { 549 printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
451 printk(KERN_INFO "btrfs bad fsid on block %llu\n",
452 (unsigned long long)eb->start); 550 (unsigned long long)eb->start);
453 }
454 ret = -EIO; 551 ret = -EIO;
455 goto err; 552 goto err;
456 } 553 }
@@ -459,8 +556,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
459 btrfs_set_buffer_lockdep_class(eb, found_level); 556 btrfs_set_buffer_lockdep_class(eb, found_level);
460 557
461 ret = csum_tree_block(root, eb, 1); 558 ret = csum_tree_block(root, eb, 1);
462 if (ret) 559 if (ret) {
463 ret = -EIO; 560 ret = -EIO;
561 goto err;
562 }
563
564 /*
565 * If this is a leaf block and it is corrupt, set the corrupt bit so
566 * that we don't try and read the other copies of this block, just
567 * return -EIO.
568 */
569 if (found_level == 0 && check_leaf(root, eb)) {
570 set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
571 ret = -EIO;
572 }
464 573
465 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 574 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
466 end = eb->start + end - 1; 575 end = eb->start + end - 1;
@@ -481,9 +590,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
481 end_io_wq->work.flags = 0; 590 end_io_wq->work.flags = 0;
482 591
483 if (bio->bi_rw & REQ_WRITE) { 592 if (bio->bi_rw & REQ_WRITE) {
484 if (end_io_wq->metadata) 593 if (end_io_wq->metadata == 1)
485 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 594 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
486 &end_io_wq->work); 595 &end_io_wq->work);
596 else if (end_io_wq->metadata == 2)
597 btrfs_queue_worker(&fs_info->endio_freespace_worker,
598 &end_io_wq->work);
487 else 599 else
488 btrfs_queue_worker(&fs_info->endio_write_workers, 600 btrfs_queue_worker(&fs_info->endio_write_workers,
489 &end_io_wq->work); 601 &end_io_wq->work);
@@ -497,6 +609,13 @@ static void end_workqueue_bio(struct bio *bio, int err)
497 } 609 }
498} 610}
499 611
612/*
613 * For the metadata arg you want
614 *
615 * 0 - if data
616 * 1 - if normal metadata
617 * 2 - if writing to the free space cache area
618 */
500int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 619int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
501 int metadata) 620 int metadata)
502{ 621{
@@ -525,19 +644,11 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
525 return 256 * limit; 644 return 256 * limit;
526} 645}
527 646
528int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
529{
530 return atomic_read(&info->nr_async_bios) >
531 btrfs_async_submit_limit(info);
532}
533
534static void run_one_async_start(struct btrfs_work *work) 647static void run_one_async_start(struct btrfs_work *work)
535{ 648{
536 struct btrfs_fs_info *fs_info;
537 struct async_submit_bio *async; 649 struct async_submit_bio *async;
538 650
539 async = container_of(work, struct async_submit_bio, work); 651 async = container_of(work, struct async_submit_bio, work);
540 fs_info = BTRFS_I(async->inode)->root->fs_info;
541 async->submit_bio_start(async->inode, async->rw, async->bio, 652 async->submit_bio_start(async->inode, async->rw, async->bio,
542 async->mirror_num, async->bio_flags, 653 async->mirror_num, async->bio_flags,
543 async->bio_offset); 654 async->bio_offset);
@@ -688,6 +799,27 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
688 __btree_submit_bio_done); 799 __btree_submit_bio_done);
689} 800}
690 801
802#ifdef CONFIG_MIGRATION
803static int btree_migratepage(struct address_space *mapping,
804 struct page *newpage, struct page *page)
805{
806 /*
807 * we can't safely write a btree page from here,
808 * we haven't done the locking hook
809 */
810 if (PageDirty(page))
811 return -EAGAIN;
812 /*
813 * Buffers may be managed in a filesystem specific way.
814 * We must have no buffers or drop them.
815 */
816 if (page_has_private(page) &&
817 !try_to_release_page(page, GFP_KERNEL))
818 return -EAGAIN;
819 return migrate_page(mapping, newpage, page);
820}
821#endif
822
691static int btree_writepage(struct page *page, struct writeback_control *wbc) 823static int btree_writepage(struct page *page, struct writeback_control *wbc)
692{ 824{
693 struct extent_io_tree *tree; 825 struct extent_io_tree *tree;
@@ -702,8 +834,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
702 } 834 }
703 835
704 redirty_page_for_writepage(wbc, page); 836 redirty_page_for_writepage(wbc, page);
705 eb = btrfs_find_tree_block(root, page_offset(page), 837 eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
706 PAGE_CACHE_SIZE);
707 WARN_ON(!eb); 838 WARN_ON(!eb);
708 839
709 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 840 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
@@ -793,7 +924,9 @@ static const struct address_space_operations btree_aops = {
793 .writepages = btree_writepages, 924 .writepages = btree_writepages,
794 .releasepage = btree_releasepage, 925 .releasepage = btree_releasepage,
795 .invalidatepage = btree_invalidatepage, 926 .invalidatepage = btree_invalidatepage,
796 .sync_page = block_sync_page, 927#ifdef CONFIG_MIGRATION
928 .migratepage = btree_migratepage,
929#endif
797}; 930};
798 931
799int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 932int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -818,7 +951,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
818 struct inode *btree_inode = root->fs_info->btree_inode; 951 struct inode *btree_inode = root->fs_info->btree_inode;
819 struct extent_buffer *eb; 952 struct extent_buffer *eb;
820 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, 953 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
821 bytenr, blocksize, GFP_NOFS); 954 bytenr, blocksize);
822 return eb; 955 return eb;
823} 956}
824 957
@@ -829,7 +962,7 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
829 struct extent_buffer *eb; 962 struct extent_buffer *eb;
830 963
831 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, 964 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
832 bytenr, blocksize, NULL, GFP_NOFS); 965 bytenr, blocksize, NULL);
833 return eb; 966 return eb;
834} 967}
835 968
@@ -850,12 +983,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
850 u32 blocksize, u64 parent_transid) 983 u32 blocksize, u64 parent_transid)
851{ 984{
852 struct extent_buffer *buf = NULL; 985 struct extent_buffer *buf = NULL;
853 struct inode *btree_inode = root->fs_info->btree_inode;
854 struct extent_io_tree *io_tree;
855 int ret; 986 int ret;
856 987
857 io_tree = &BTRFS_I(btree_inode)->io_tree;
858
859 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 988 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
860 if (!buf) 989 if (!buf)
861 return NULL; 990 return NULL;
@@ -915,15 +1044,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
915 root->last_trans = 0; 1044 root->last_trans = 0;
916 root->highest_objectid = 0; 1045 root->highest_objectid = 0;
917 root->name = NULL; 1046 root->name = NULL;
918 root->in_sysfs = 0;
919 root->inode_tree = RB_ROOT; 1047 root->inode_tree = RB_ROOT;
1048 INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
920 root->block_rsv = NULL; 1049 root->block_rsv = NULL;
921 root->orphan_block_rsv = NULL; 1050 root->orphan_block_rsv = NULL;
922 1051
923 INIT_LIST_HEAD(&root->dirty_list); 1052 INIT_LIST_HEAD(&root->dirty_list);
924 INIT_LIST_HEAD(&root->orphan_list); 1053 INIT_LIST_HEAD(&root->orphan_list);
925 INIT_LIST_HEAD(&root->root_list); 1054 INIT_LIST_HEAD(&root->root_list);
926 spin_lock_init(&root->node_lock);
927 spin_lock_init(&root->orphan_lock); 1055 spin_lock_init(&root->orphan_lock);
928 spin_lock_init(&root->inode_lock); 1056 spin_lock_init(&root->inode_lock);
929 spin_lock_init(&root->accounting_lock); 1057 spin_lock_init(&root->accounting_lock);
@@ -939,7 +1067,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
939 root->log_transid = 0; 1067 root->log_transid = 0;
940 root->last_log_commit = 0; 1068 root->last_log_commit = 0;
941 extent_io_tree_init(&root->dirty_log_pages, 1069 extent_io_tree_init(&root->dirty_log_pages,
942 fs_info->btree_inode->i_mapping, GFP_NOFS); 1070 fs_info->btree_inode->i_mapping);
943 1071
944 memset(&root->root_key, 0, sizeof(root->root_key)); 1072 memset(&root->root_key, 0, sizeof(root->root_key));
945 memset(&root->root_item, 0, sizeof(root->root_item)); 1073 memset(&root->root_item, 0, sizeof(root->root_item));
@@ -980,7 +1108,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
980 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1108 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
981 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1109 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
982 blocksize, generation); 1110 blocksize, generation);
983 BUG_ON(!root->node); 1111 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1112 free_extent_buffer(root->node);
1113 return -EIO;
1114 }
984 root->commit_root = btrfs_root_node(root); 1115 root->commit_root = btrfs_root_node(root);
985 return 0; 1116 return 0;
986} 1117}
@@ -1104,7 +1235,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1104 root, fs_info, location->objectid); 1235 root, fs_info, location->objectid);
1105 1236
1106 path = btrfs_alloc_path(); 1237 path = btrfs_alloc_path();
1107 BUG_ON(!path); 1238 if (!path) {
1239 kfree(root);
1240 return ERR_PTR(-ENOMEM);
1241 }
1108 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); 1242 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1109 if (ret == 0) { 1243 if (ret == 0) {
1110 l = path->nodes[0]; 1244 l = path->nodes[0];
@@ -1115,6 +1249,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1115 } 1249 }
1116 btrfs_free_path(path); 1250 btrfs_free_path(path);
1117 if (ret) { 1251 if (ret) {
1252 kfree(root);
1118 if (ret > 0) 1253 if (ret > 0)
1119 ret = -ENOENT; 1254 ret = -ENOENT;
1120 return ERR_PTR(ret); 1255 return ERR_PTR(ret);
@@ -1127,27 +1262,14 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1127 root->commit_root = btrfs_root_node(root); 1262 root->commit_root = btrfs_root_node(root);
1128 BUG_ON(!root->node); 1263 BUG_ON(!root->node);
1129out: 1264out:
1130 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) 1265 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1131 root->ref_cows = 1; 1266 root->ref_cows = 1;
1267 btrfs_check_and_init_root_item(&root->root_item);
1268 }
1132 1269
1133 return root; 1270 return root;
1134} 1271}
1135 1272
1136struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1137 u64 root_objectid)
1138{
1139 struct btrfs_root *root;
1140
1141 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
1142 return fs_info->tree_root;
1143 if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
1144 return fs_info->extent_root;
1145
1146 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1147 (unsigned long)root_objectid);
1148 return root;
1149}
1150
1151struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 1273struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1152 struct btrfs_key *location) 1274 struct btrfs_key *location)
1153{ 1275{
@@ -1176,7 +1298,22 @@ again:
1176 if (IS_ERR(root)) 1298 if (IS_ERR(root))
1177 return root; 1299 return root;
1178 1300
1179 set_anon_super(&root->anon_super, NULL); 1301 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1302 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
1303 GFP_NOFS);
1304 if (!root->free_ino_pinned || !root->free_ino_ctl) {
1305 ret = -ENOMEM;
1306 goto fail;
1307 }
1308
1309 btrfs_init_free_ino_ctl(root);
1310 mutex_init(&root->fs_commit_mutex);
1311 spin_lock_init(&root->cache_lock);
1312 init_waitqueue_head(&root->cache_wait);
1313
1314 ret = set_anon_super(&root->anon_super, NULL);
1315 if (ret)
1316 goto fail;
1180 1317
1181 if (btrfs_root_refs(&root->root_item) == 0) { 1318 if (btrfs_root_refs(&root->root_item) == 0) {
1182 ret = -ENOENT; 1319 ret = -ENOENT;
@@ -1219,41 +1356,6 @@ fail:
1219 return ERR_PTR(ret); 1356 return ERR_PTR(ret);
1220} 1357}
1221 1358
1222struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1223 struct btrfs_key *location,
1224 const char *name, int namelen)
1225{
1226 return btrfs_read_fs_root_no_name(fs_info, location);
1227#if 0
1228 struct btrfs_root *root;
1229 int ret;
1230
1231 root = btrfs_read_fs_root_no_name(fs_info, location);
1232 if (!root)
1233 return NULL;
1234
1235 if (root->in_sysfs)
1236 return root;
1237
1238 ret = btrfs_set_root_name(root, name, namelen);
1239 if (ret) {
1240 free_extent_buffer(root->node);
1241 kfree(root);
1242 return ERR_PTR(ret);
1243 }
1244
1245 ret = btrfs_sysfs_add_root(root);
1246 if (ret) {
1247 free_extent_buffer(root->node);
1248 kfree(root->name);
1249 kfree(root);
1250 return ERR_PTR(ret);
1251 }
1252 root->in_sysfs = 1;
1253 return root;
1254#endif
1255}
1256
1257static int btrfs_congested_fn(void *congested_data, int bdi_bits) 1359static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1258{ 1360{
1259 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; 1361 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
@@ -1261,7 +1363,8 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1261 struct btrfs_device *device; 1363 struct btrfs_device *device;
1262 struct backing_dev_info *bdi; 1364 struct backing_dev_info *bdi;
1263 1365
1264 list_for_each_entry(device, &info->fs_devices->devices, dev_list) { 1366 rcu_read_lock();
1367 list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
1265 if (!device->bdev) 1368 if (!device->bdev)
1266 continue; 1369 continue;
1267 bdi = blk_get_backing_dev_info(device->bdev); 1370 bdi = blk_get_backing_dev_info(device->bdev);
@@ -1270,86 +1373,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1270 break; 1373 break;
1271 } 1374 }
1272 } 1375 }
1376 rcu_read_unlock();
1273 return ret; 1377 return ret;
1274} 1378}
1275 1379
1276/* 1380/*
1277 * this unplugs every device on the box, and it is only used when page
1278 * is null
1279 */
1280static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1281{
1282 struct btrfs_device *device;
1283 struct btrfs_fs_info *info;
1284
1285 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1286 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1287 if (!device->bdev)
1288 continue;
1289
1290 bdi = blk_get_backing_dev_info(device->bdev);
1291 if (bdi->unplug_io_fn)
1292 bdi->unplug_io_fn(bdi, page);
1293 }
1294}
1295
1296static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1297{
1298 struct inode *inode;
1299 struct extent_map_tree *em_tree;
1300 struct extent_map *em;
1301 struct address_space *mapping;
1302 u64 offset;
1303
1304 /* the generic O_DIRECT read code does this */
1305 if (1 || !page) {
1306 __unplug_io_fn(bdi, page);
1307 return;
1308 }
1309
1310 /*
1311 * page->mapping may change at any time. Get a consistent copy
1312 * and use that for everything below
1313 */
1314 smp_mb();
1315 mapping = page->mapping;
1316 if (!mapping)
1317 return;
1318
1319 inode = mapping->host;
1320
1321 /*
1322 * don't do the expensive searching for a small number of
1323 * devices
1324 */
1325 if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
1326 __unplug_io_fn(bdi, page);
1327 return;
1328 }
1329
1330 offset = page_offset(page);
1331
1332 em_tree = &BTRFS_I(inode)->extent_tree;
1333 read_lock(&em_tree->lock);
1334 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1335 read_unlock(&em_tree->lock);
1336 if (!em) {
1337 __unplug_io_fn(bdi, page);
1338 return;
1339 }
1340
1341 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1342 free_extent_map(em);
1343 __unplug_io_fn(bdi, page);
1344 return;
1345 }
1346 offset = offset - em->start;
1347 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1348 em->block_start + offset, page);
1349 free_extent_map(em);
1350}
1351
1352/*
1353 * If this fails, caller must call bdi_destroy() to get rid of the 1381 * If this fails, caller must call bdi_destroy() to get rid of the
1354 * bdi again. 1382 * bdi again.
1355 */ 1383 */
@@ -1363,8 +1391,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1363 return err; 1391 return err;
1364 1392
1365 bdi->ra_pages = default_backing_dev_info.ra_pages; 1393 bdi->ra_pages = default_backing_dev_info.ra_pages;
1366 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1367 bdi->unplug_io_data = info;
1368 bdi->congested_fn = btrfs_congested_fn; 1394 bdi->congested_fn = btrfs_congested_fn;
1369 bdi->congested_data = info; 1395 bdi->congested_data = info;
1370 return 0; 1396 return 0;
@@ -1377,7 +1403,6 @@ static int bio_ready_for_csum(struct bio *bio)
1377 u64 start = 0; 1403 u64 start = 0;
1378 struct page *page; 1404 struct page *page;
1379 struct extent_io_tree *io_tree = NULL; 1405 struct extent_io_tree *io_tree = NULL;
1380 struct btrfs_fs_info *info = NULL;
1381 struct bio_vec *bvec; 1406 struct bio_vec *bvec;
1382 int i; 1407 int i;
1383 int ret; 1408 int ret;
@@ -1396,7 +1421,6 @@ static int bio_ready_for_csum(struct bio *bio)
1396 buf_len = page->private >> 2; 1421 buf_len = page->private >> 2;
1397 start = page_offset(page) + bvec->bv_offset; 1422 start = page_offset(page) + bvec->bv_offset;
1398 io_tree = &BTRFS_I(page->mapping->host)->io_tree; 1423 io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1399 info = BTRFS_I(page->mapping->host)->root->fs_info;
1400 } 1424 }
1401 /* are we fully contained in this bio? */ 1425 /* are we fully contained in this bio? */
1402 if (buf_len <= length) 1426 if (buf_len <= length)
@@ -1452,6 +1476,7 @@ static int cleaner_kthread(void *arg)
1452 btrfs_run_delayed_iputs(root); 1476 btrfs_run_delayed_iputs(root);
1453 btrfs_clean_old_snapshots(root); 1477 btrfs_clean_old_snapshots(root);
1454 mutex_unlock(&root->fs_info->cleaner_mutex); 1478 mutex_unlock(&root->fs_info->cleaner_mutex);
1479 btrfs_run_defrag_inodes(root->fs_info);
1455 } 1480 }
1456 1481
1457 if (freezing(current)) { 1482 if (freezing(current)) {
@@ -1481,24 +1506,25 @@ static int transaction_kthread(void *arg)
1481 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1506 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1482 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1507 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1483 1508
1484 spin_lock(&root->fs_info->new_trans_lock); 1509 spin_lock(&root->fs_info->trans_lock);
1485 cur = root->fs_info->running_transaction; 1510 cur = root->fs_info->running_transaction;
1486 if (!cur) { 1511 if (!cur) {
1487 spin_unlock(&root->fs_info->new_trans_lock); 1512 spin_unlock(&root->fs_info->trans_lock);
1488 goto sleep; 1513 goto sleep;
1489 } 1514 }
1490 1515
1491 now = get_seconds(); 1516 now = get_seconds();
1492 if (!cur->blocked && 1517 if (!cur->blocked &&
1493 (now < cur->start_time || now - cur->start_time < 30)) { 1518 (now < cur->start_time || now - cur->start_time < 30)) {
1494 spin_unlock(&root->fs_info->new_trans_lock); 1519 spin_unlock(&root->fs_info->trans_lock);
1495 delay = HZ * 5; 1520 delay = HZ * 5;
1496 goto sleep; 1521 goto sleep;
1497 } 1522 }
1498 transid = cur->transid; 1523 transid = cur->transid;
1499 spin_unlock(&root->fs_info->new_trans_lock); 1524 spin_unlock(&root->fs_info->trans_lock);
1500 1525
1501 trans = btrfs_join_transaction(root, 1); 1526 trans = btrfs_join_transaction(root);
1527 BUG_ON(IS_ERR(trans));
1502 if (transid == trans->transid) { 1528 if (transid == trans->transid) {
1503 ret = btrfs_commit_transaction(trans, root); 1529 ret = btrfs_commit_transaction(trans, root);
1504 BUG_ON(ret); 1530 BUG_ON(ret);
@@ -1539,10 +1565,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1539 GFP_NOFS); 1565 GFP_NOFS);
1540 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), 1566 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1541 GFP_NOFS); 1567 GFP_NOFS);
1542 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), 1568 struct btrfs_root *tree_root = btrfs_sb(sb);
1543 GFP_NOFS); 1569 struct btrfs_fs_info *fs_info = NULL;
1544 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1545 GFP_NOFS);
1546 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), 1570 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1547 GFP_NOFS); 1571 GFP_NOFS);
1548 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), 1572 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
@@ -1554,11 +1578,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1554 1578
1555 struct btrfs_super_block *disk_super; 1579 struct btrfs_super_block *disk_super;
1556 1580
1557 if (!extent_root || !tree_root || !fs_info || 1581 if (!extent_root || !tree_root || !tree_root->fs_info ||
1558 !chunk_root || !dev_root || !csum_root) { 1582 !chunk_root || !dev_root || !csum_root) {
1559 err = -ENOMEM; 1583 err = -ENOMEM;
1560 goto fail; 1584 goto fail;
1561 } 1585 }
1586 fs_info = tree_root->fs_info;
1562 1587
1563 ret = init_srcu_struct(&fs_info->subvol_srcu); 1588 ret = init_srcu_struct(&fs_info->subvol_srcu);
1564 if (ret) { 1589 if (ret) {
@@ -1578,6 +1603,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1578 goto fail_bdi; 1603 goto fail_bdi;
1579 } 1604 }
1580 1605
1606 fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
1607
1581 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 1608 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1582 INIT_LIST_HEAD(&fs_info->trans_list); 1609 INIT_LIST_HEAD(&fs_info->trans_list);
1583 INIT_LIST_HEAD(&fs_info->dead_roots); 1610 INIT_LIST_HEAD(&fs_info->dead_roots);
@@ -1587,10 +1614,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1587 INIT_LIST_HEAD(&fs_info->ordered_operations); 1614 INIT_LIST_HEAD(&fs_info->ordered_operations);
1588 INIT_LIST_HEAD(&fs_info->caching_block_groups); 1615 INIT_LIST_HEAD(&fs_info->caching_block_groups);
1589 spin_lock_init(&fs_info->delalloc_lock); 1616 spin_lock_init(&fs_info->delalloc_lock);
1590 spin_lock_init(&fs_info->new_trans_lock); 1617 spin_lock_init(&fs_info->trans_lock);
1591 spin_lock_init(&fs_info->ref_cache_lock); 1618 spin_lock_init(&fs_info->ref_cache_lock);
1592 spin_lock_init(&fs_info->fs_roots_radix_lock); 1619 spin_lock_init(&fs_info->fs_roots_radix_lock);
1593 spin_lock_init(&fs_info->delayed_iput_lock); 1620 spin_lock_init(&fs_info->delayed_iput_lock);
1621 spin_lock_init(&fs_info->defrag_inodes_lock);
1622 mutex_init(&fs_info->reloc_mutex);
1594 1623
1595 init_completion(&fs_info->kobj_unregister); 1624 init_completion(&fs_info->kobj_unregister);
1596 fs_info->tree_root = tree_root; 1625 fs_info->tree_root = tree_root;
@@ -1613,15 +1642,34 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1613 atomic_set(&fs_info->async_delalloc_pages, 0); 1642 atomic_set(&fs_info->async_delalloc_pages, 0);
1614 atomic_set(&fs_info->async_submit_draining, 0); 1643 atomic_set(&fs_info->async_submit_draining, 0);
1615 atomic_set(&fs_info->nr_async_bios, 0); 1644 atomic_set(&fs_info->nr_async_bios, 0);
1645 atomic_set(&fs_info->defrag_running, 0);
1616 fs_info->sb = sb; 1646 fs_info->sb = sb;
1617 fs_info->max_inline = 8192 * 1024; 1647 fs_info->max_inline = 8192 * 1024;
1618 fs_info->metadata_ratio = 0; 1648 fs_info->metadata_ratio = 0;
1649 fs_info->defrag_inodes = RB_ROOT;
1650 fs_info->trans_no_join = 0;
1619 1651
1620 fs_info->thread_pool_size = min_t(unsigned long, 1652 fs_info->thread_pool_size = min_t(unsigned long,
1621 num_online_cpus() + 2, 8); 1653 num_online_cpus() + 2, 8);
1622 1654
1623 INIT_LIST_HEAD(&fs_info->ordered_extents); 1655 INIT_LIST_HEAD(&fs_info->ordered_extents);
1624 spin_lock_init(&fs_info->ordered_extent_lock); 1656 spin_lock_init(&fs_info->ordered_extent_lock);
1657 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
1658 GFP_NOFS);
1659 if (!fs_info->delayed_root) {
1660 err = -ENOMEM;
1661 goto fail_iput;
1662 }
1663 btrfs_init_delayed_root(fs_info->delayed_root);
1664
1665 mutex_init(&fs_info->scrub_lock);
1666 atomic_set(&fs_info->scrubs_running, 0);
1667 atomic_set(&fs_info->scrub_pause_req, 0);
1668 atomic_set(&fs_info->scrubs_paused, 0);
1669 atomic_set(&fs_info->scrub_cancel_req, 0);
1670 init_waitqueue_head(&fs_info->scrub_pause_wait);
1671 init_rwsem(&fs_info->scrub_super_lock);
1672 fs_info->scrub_workers_refcnt = 0;
1625 1673
1626 sb->s_blocksize = 4096; 1674 sb->s_blocksize = 4096;
1627 sb->s_blocksize_bits = blksize_bits(4096); 1675 sb->s_blocksize_bits = blksize_bits(4096);
@@ -1640,10 +1688,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1640 1688
1641 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); 1689 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
1642 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, 1690 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1643 fs_info->btree_inode->i_mapping, 1691 fs_info->btree_inode->i_mapping);
1644 GFP_NOFS); 1692 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
1645 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
1646 GFP_NOFS);
1647 1693
1648 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; 1694 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1649 1695
@@ -1657,14 +1703,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1657 fs_info->block_group_cache_tree = RB_ROOT; 1703 fs_info->block_group_cache_tree = RB_ROOT;
1658 1704
1659 extent_io_tree_init(&fs_info->freed_extents[0], 1705 extent_io_tree_init(&fs_info->freed_extents[0],
1660 fs_info->btree_inode->i_mapping, GFP_NOFS); 1706 fs_info->btree_inode->i_mapping);
1661 extent_io_tree_init(&fs_info->freed_extents[1], 1707 extent_io_tree_init(&fs_info->freed_extents[1],
1662 fs_info->btree_inode->i_mapping, GFP_NOFS); 1708 fs_info->btree_inode->i_mapping);
1663 fs_info->pinned_extents = &fs_info->freed_extents[0]; 1709 fs_info->pinned_extents = &fs_info->freed_extents[0];
1664 fs_info->do_barriers = 1; 1710 fs_info->do_barriers = 1;
1665 1711
1666 1712
1667 mutex_init(&fs_info->trans_mutex);
1668 mutex_init(&fs_info->ordered_operations_mutex); 1713 mutex_init(&fs_info->ordered_operations_mutex);
1669 mutex_init(&fs_info->tree_log_mutex); 1714 mutex_init(&fs_info->tree_log_mutex);
1670 mutex_init(&fs_info->chunk_mutex); 1715 mutex_init(&fs_info->chunk_mutex);
@@ -1680,15 +1725,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1680 1725
1681 init_waitqueue_head(&fs_info->transaction_throttle); 1726 init_waitqueue_head(&fs_info->transaction_throttle);
1682 init_waitqueue_head(&fs_info->transaction_wait); 1727 init_waitqueue_head(&fs_info->transaction_wait);
1728 init_waitqueue_head(&fs_info->transaction_blocked_wait);
1683 init_waitqueue_head(&fs_info->async_submit_wait); 1729 init_waitqueue_head(&fs_info->async_submit_wait);
1684 1730
1685 __setup_root(4096, 4096, 4096, 4096, tree_root, 1731 __setup_root(4096, 4096, 4096, 4096, tree_root,
1686 fs_info, BTRFS_ROOT_TREE_OBJECTID); 1732 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1687 1733
1688
1689 bh = btrfs_read_dev_super(fs_devices->latest_bdev); 1734 bh = btrfs_read_dev_super(fs_devices->latest_bdev);
1690 if (!bh) 1735 if (!bh) {
1691 goto fail_iput; 1736 err = -EINVAL;
1737 goto fail_alloc;
1738 }
1692 1739
1693 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 1740 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1694 memcpy(&fs_info->super_for_commit, &fs_info->super_copy, 1741 memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1699,12 +1746,23 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1699 1746
1700 disk_super = &fs_info->super_copy; 1747 disk_super = &fs_info->super_copy;
1701 if (!btrfs_super_root(disk_super)) 1748 if (!btrfs_super_root(disk_super))
1702 goto fail_iput; 1749 goto fail_alloc;
1750
1751 /* check FS state, whether FS is broken. */
1752 fs_info->fs_state |= btrfs_super_flags(disk_super);
1753
1754 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1755
1756 /*
1757 * In the long term, we'll store the compression type in the super
1758 * block, and it'll be used for per file compression control.
1759 */
1760 fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
1703 1761
1704 ret = btrfs_parse_options(tree_root, options); 1762 ret = btrfs_parse_options(tree_root, options);
1705 if (ret) { 1763 if (ret) {
1706 err = ret; 1764 err = ret;
1707 goto fail_iput; 1765 goto fail_alloc;
1708 } 1766 }
1709 1767
1710 features = btrfs_super_incompat_flags(disk_super) & 1768 features = btrfs_super_incompat_flags(disk_super) &
@@ -1714,14 +1772,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1714 "unsupported optional features (%Lx).\n", 1772 "unsupported optional features (%Lx).\n",
1715 (unsigned long long)features); 1773 (unsigned long long)features);
1716 err = -EINVAL; 1774 err = -EINVAL;
1717 goto fail_iput; 1775 goto fail_alloc;
1718 } 1776 }
1719 1777
1720 features = btrfs_super_incompat_flags(disk_super); 1778 features = btrfs_super_incompat_flags(disk_super);
1721 if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { 1779 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
1722 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; 1780 if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
1723 btrfs_set_super_incompat_flags(disk_super, features); 1781 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1724 } 1782 btrfs_set_super_incompat_flags(disk_super, features);
1725 1783
1726 features = btrfs_super_compat_ro_flags(disk_super) & 1784 features = btrfs_super_compat_ro_flags(disk_super) &
1727 ~BTRFS_FEATURE_COMPAT_RO_SUPP; 1785 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
@@ -1730,7 +1788,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1730 "unsupported option features (%Lx).\n", 1788 "unsupported option features (%Lx).\n",
1731 (unsigned long long)features); 1789 (unsigned long long)features);
1732 err = -EINVAL; 1790 err = -EINVAL;
1733 goto fail_iput; 1791 goto fail_alloc;
1734 } 1792 }
1735 1793
1736 btrfs_init_workers(&fs_info->generic_worker, 1794 btrfs_init_workers(&fs_info->generic_worker,
@@ -1775,6 +1833,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1775 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 1833 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1776 fs_info->thread_pool_size, 1834 fs_info->thread_pool_size,
1777 &fs_info->generic_worker); 1835 &fs_info->generic_worker);
1836 btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
1837 1, &fs_info->generic_worker);
1838 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
1839 fs_info->thread_pool_size,
1840 &fs_info->generic_worker);
1778 1841
1779 /* 1842 /*
1780 * endios are largely parallel and should have a very 1843 * endios are largely parallel and should have a very
@@ -1795,6 +1858,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1795 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 1858 btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1796 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 1859 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1797 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1860 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1861 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1862 btrfs_start_workers(&fs_info->delayed_workers, 1);
1798 1863
1799 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1864 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1800 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1865 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1903,6 +1968,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1903 fs_info->metadata_alloc_profile = (u64)-1; 1968 fs_info->metadata_alloc_profile = (u64)-1;
1904 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1969 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1905 1970
1971 ret = btrfs_init_space_info(fs_info);
1972 if (ret) {
1973 printk(KERN_ERR "Failed to initial space info: %d\n", ret);
1974 goto fail_block_groups;
1975 }
1976
1906 ret = btrfs_read_block_groups(extent_root); 1977 ret = btrfs_read_block_groups(extent_root);
1907 if (ret) { 1978 if (ret) {
1908 printk(KERN_ERR "Failed to read block groups: %d\n", ret); 1979 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
@@ -1928,7 +1999,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1928 btrfs_set_opt(fs_info->mount_opt, SSD); 1999 btrfs_set_opt(fs_info->mount_opt, SSD);
1929 } 2000 }
1930 2001
1931 if (btrfs_super_log_root(disk_super) != 0) { 2002 /* do not make disk changes in broken FS */
2003 if (btrfs_super_log_root(disk_super) != 0 &&
2004 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
1932 u64 bytenr = btrfs_super_log_root(disk_super); 2005 u64 bytenr = btrfs_super_log_root(disk_super);
1933 2006
1934 if (fs_devices->rw_devices == 0) { 2007 if (fs_devices->rw_devices == 0) {
@@ -1992,8 +2065,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1992 2065
1993 if (!(sb->s_flags & MS_RDONLY)) { 2066 if (!(sb->s_flags & MS_RDONLY)) {
1994 down_read(&fs_info->cleanup_work_sem); 2067 down_read(&fs_info->cleanup_work_sem);
1995 btrfs_orphan_cleanup(fs_info->fs_root); 2068 err = btrfs_orphan_cleanup(fs_info->fs_root);
2069 if (!err)
2070 err = btrfs_orphan_cleanup(fs_info->tree_root);
1996 up_read(&fs_info->cleanup_work_sem); 2071 up_read(&fs_info->cleanup_work_sem);
2072 if (err) {
2073 close_ctree(tree_root);
2074 return ERR_PTR(err);
2075 }
1997 } 2076 }
1998 2077
1999 return tree_root; 2078 return tree_root;
@@ -2035,7 +2114,11 @@ fail_sb_buffer:
2035 btrfs_stop_workers(&fs_info->endio_meta_workers); 2114 btrfs_stop_workers(&fs_info->endio_meta_workers);
2036 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2115 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2037 btrfs_stop_workers(&fs_info->endio_write_workers); 2116 btrfs_stop_workers(&fs_info->endio_write_workers);
2117 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2038 btrfs_stop_workers(&fs_info->submit_workers); 2118 btrfs_stop_workers(&fs_info->submit_workers);
2119 btrfs_stop_workers(&fs_info->delayed_workers);
2120fail_alloc:
2121 kfree(fs_info->delayed_root);
2039fail_iput: 2122fail_iput:
2040 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2123 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2041 iput(fs_info->btree_inode); 2124 iput(fs_info->btree_inode);
@@ -2063,11 +2146,9 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
2063 if (uptodate) { 2146 if (uptodate) {
2064 set_buffer_uptodate(bh); 2147 set_buffer_uptodate(bh);
2065 } else { 2148 } else {
2066 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { 2149 printk_ratelimited(KERN_WARNING "lost page write due to "
2067 printk(KERN_WARNING "lost page write due to "
2068 "I/O error on %s\n", 2150 "I/O error on %s\n",
2069 bdevname(bh->b_bdev, b)); 2151 bdevname(bh->b_bdev, b));
2070 }
2071 /* note, we dont' set_buffer_write_io_error because we have 2152 /* note, we dont' set_buffer_write_io_error because we have
2072 * our own ways of dealing with the IO errors 2153 * our own ways of dealing with the IO errors
2073 */ 2154 */
@@ -2200,21 +2281,10 @@ static int write_dev_supers(struct btrfs_device *device,
2200 bh->b_end_io = btrfs_end_buffer_write_sync; 2281 bh->b_end_io = btrfs_end_buffer_write_sync;
2201 } 2282 }
2202 2283
2203 if (i == last_barrier && do_barriers && device->barriers) { 2284 if (i == last_barrier && do_barriers)
2204 ret = submit_bh(WRITE_BARRIER, bh); 2285 ret = submit_bh(WRITE_FLUSH_FUA, bh);
2205 if (ret == -EOPNOTSUPP) { 2286 else
2206 printk("btrfs: disabling barriers on dev %s\n",
2207 device->name);
2208 set_buffer_uptodate(bh);
2209 device->barriers = 0;
2210 /* one reference for submit_bh */
2211 get_bh(bh);
2212 lock_buffer(bh);
2213 ret = submit_bh(WRITE_SYNC, bh);
2214 }
2215 } else {
2216 ret = submit_bh(WRITE_SYNC, bh); 2287 ret = submit_bh(WRITE_SYNC, bh);
2217 }
2218 2288
2219 if (ret) 2289 if (ret)
2220 errors++; 2290 errors++;
@@ -2242,7 +2312,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2242 2312
2243 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2313 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2244 head = &root->fs_info->fs_devices->devices; 2314 head = &root->fs_info->fs_devices->devices;
2245 list_for_each_entry(dev, head, dev_list) { 2315 list_for_each_entry_rcu(dev, head, dev_list) {
2246 if (!dev->bdev) { 2316 if (!dev->bdev) {
2247 total_errors++; 2317 total_errors++;
2248 continue; 2318 continue;
@@ -2275,7 +2345,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2275 } 2345 }
2276 2346
2277 total_errors = 0; 2347 total_errors = 0;
2278 list_for_each_entry(dev, head, dev_list) { 2348 list_for_each_entry_rcu(dev, head, dev_list) {
2279 if (!dev->bdev) 2349 if (!dev->bdev)
2280 continue; 2350 continue;
2281 if (!dev->in_fs_metadata || !dev->writeable) 2351 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2313,12 +2383,15 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2313 if (btrfs_root_refs(&root->root_item) == 0) 2383 if (btrfs_root_refs(&root->root_item) == 0)
2314 synchronize_srcu(&fs_info->subvol_srcu); 2384 synchronize_srcu(&fs_info->subvol_srcu);
2315 2385
2386 __btrfs_remove_free_space_cache(root->free_ino_pinned);
2387 __btrfs_remove_free_space_cache(root->free_ino_ctl);
2316 free_fs_root(root); 2388 free_fs_root(root);
2317 return 0; 2389 return 0;
2318} 2390}
2319 2391
2320static void free_fs_root(struct btrfs_root *root) 2392static void free_fs_root(struct btrfs_root *root)
2321{ 2393{
2394 iput(root->cache_inode);
2322 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); 2395 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2323 if (root->anon_super.s_dev) { 2396 if (root->anon_super.s_dev) {
2324 down_write(&root->anon_super.s_umount); 2397 down_write(&root->anon_super.s_umount);
@@ -2326,6 +2399,8 @@ static void free_fs_root(struct btrfs_root *root)
2326 } 2399 }
2327 free_extent_buffer(root->node); 2400 free_extent_buffer(root->node);
2328 free_extent_buffer(root->commit_root); 2401 free_extent_buffer(root->commit_root);
2402 kfree(root->free_ino_ctl);
2403 kfree(root->free_ino_pinned);
2329 kfree(root->name); 2404 kfree(root->name);
2330 kfree(root); 2405 kfree(root);
2331} 2406}
@@ -2378,8 +2453,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2378 2453
2379 root_objectid = gang[ret - 1]->root_key.objectid + 1; 2454 root_objectid = gang[ret - 1]->root_key.objectid + 1;
2380 for (i = 0; i < ret; i++) { 2455 for (i = 0; i < ret; i++) {
2456 int err;
2457
2381 root_objectid = gang[i]->root_key.objectid; 2458 root_objectid = gang[i]->root_key.objectid;
2382 btrfs_orphan_cleanup(gang[i]); 2459 err = btrfs_orphan_cleanup(gang[i]);
2460 if (err)
2461 return err;
2383 } 2462 }
2384 root_objectid++; 2463 root_objectid++;
2385 } 2464 }
@@ -2400,11 +2479,15 @@ int btrfs_commit_super(struct btrfs_root *root)
2400 down_write(&root->fs_info->cleanup_work_sem); 2479 down_write(&root->fs_info->cleanup_work_sem);
2401 up_write(&root->fs_info->cleanup_work_sem); 2480 up_write(&root->fs_info->cleanup_work_sem);
2402 2481
2403 trans = btrfs_join_transaction(root, 1); 2482 trans = btrfs_join_transaction(root);
2483 if (IS_ERR(trans))
2484 return PTR_ERR(trans);
2404 ret = btrfs_commit_transaction(trans, root); 2485 ret = btrfs_commit_transaction(trans, root);
2405 BUG_ON(ret); 2486 BUG_ON(ret);
2406 /* run commit again to drop the original snapshot */ 2487 /* run commit again to drop the original snapshot */
2407 trans = btrfs_join_transaction(root, 1); 2488 trans = btrfs_join_transaction(root);
2489 if (IS_ERR(trans))
2490 return PTR_ERR(trans);
2408 btrfs_commit_transaction(trans, root); 2491 btrfs_commit_transaction(trans, root);
2409 ret = btrfs_write_and_wait_transaction(NULL, root); 2492 ret = btrfs_write_and_wait_transaction(NULL, root);
2410 BUG_ON(ret); 2493 BUG_ON(ret);
@@ -2421,8 +2504,38 @@ int close_ctree(struct btrfs_root *root)
2421 fs_info->closing = 1; 2504 fs_info->closing = 1;
2422 smp_mb(); 2505 smp_mb();
2423 2506
2507 btrfs_scrub_cancel(root);
2508
2509 /* wait for any defraggers to finish */
2510 wait_event(fs_info->transaction_wait,
2511 (atomic_read(&fs_info->defrag_running) == 0));
2512
2513 /* clear out the rbtree of defraggable inodes */
2514 btrfs_run_defrag_inodes(root->fs_info);
2515
2516 btrfs_put_block_group_cache(fs_info);
2517
2518 /*
2519 * Here come 2 situations when btrfs is broken to flip readonly:
2520 *
2521 * 1. when btrfs flips readonly somewhere else before
2522 * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
2523 * and btrfs will skip to write sb directly to keep
2524 * ERROR state on disk.
2525 *
2526 * 2. when btrfs flips readonly just in btrfs_commit_super,
2527 * and in such case, btrfs cannot write sb via btrfs_commit_super,
2528 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
2529 * btrfs will cleanup all FS resources first and write sb then.
2530 */
2424 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 2531 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2425 ret = btrfs_commit_super(root); 2532 ret = btrfs_commit_super(root);
2533 if (ret)
2534 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2535 }
2536
2537 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
2538 ret = btrfs_error_commit_super(root);
2426 if (ret) 2539 if (ret)
2427 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2540 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2428 } 2541 }
@@ -2458,6 +2571,7 @@ int close_ctree(struct btrfs_root *root)
2458 del_fs_roots(fs_info); 2571 del_fs_roots(fs_info);
2459 2572
2460 iput(fs_info->btree_inode); 2573 iput(fs_info->btree_inode);
2574 kfree(fs_info->delayed_root);
2461 2575
2462 btrfs_stop_workers(&fs_info->generic_worker); 2576 btrfs_stop_workers(&fs_info->generic_worker);
2463 btrfs_stop_workers(&fs_info->fixup_workers); 2577 btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2467,7 +2581,9 @@ int close_ctree(struct btrfs_root *root)
2467 btrfs_stop_workers(&fs_info->endio_meta_workers); 2581 btrfs_stop_workers(&fs_info->endio_meta_workers);
2468 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2582 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2469 btrfs_stop_workers(&fs_info->endio_write_workers); 2583 btrfs_stop_workers(&fs_info->endio_write_workers);
2584 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2470 btrfs_stop_workers(&fs_info->submit_workers); 2585 btrfs_stop_workers(&fs_info->submit_workers);
2586 btrfs_stop_workers(&fs_info->delayed_workers);
2471 2587
2472 btrfs_close_devices(fs_info->fs_devices); 2588 btrfs_close_devices(fs_info->fs_devices);
2473 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2589 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2480,6 +2596,8 @@ int close_ctree(struct btrfs_root *root)
2480 kfree(fs_info->chunk_root); 2596 kfree(fs_info->chunk_root);
2481 kfree(fs_info->dev_root); 2597 kfree(fs_info->dev_root);
2482 kfree(fs_info->csum_root); 2598 kfree(fs_info->csum_root);
2599 kfree(fs_info);
2600
2483 return 0; 2601 return 0;
2484} 2602}
2485 2603
@@ -2542,6 +2660,29 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2542 if (current->flags & PF_MEMALLOC) 2660 if (current->flags & PF_MEMALLOC)
2543 return; 2661 return;
2544 2662
2663 btrfs_balance_delayed_items(root);
2664
2665 num_dirty = root->fs_info->dirty_metadata_bytes;
2666
2667 if (num_dirty > thresh) {
2668 balance_dirty_pages_ratelimited_nr(
2669 root->fs_info->btree_inode->i_mapping, 1);
2670 }
2671 return;
2672}
2673
2674void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2675{
2676 /*
2677 * looks as though older kernels can get into trouble with
2678 * this code, they end up stuck in balance_dirty_pages forever
2679 */
2680 u64 num_dirty;
2681 unsigned long thresh = 32 * 1024 * 1024;
2682
2683 if (current->flags & PF_MEMALLOC)
2684 return;
2685
2545 num_dirty = root->fs_info->dirty_metadata_bytes; 2686 num_dirty = root->fs_info->dirty_metadata_bytes;
2546 2687
2547 if (num_dirty > thresh) { 2688 if (num_dirty > thresh) {
@@ -2574,7 +2715,7 @@ int btree_lock_page_hook(struct page *page)
2574 goto out; 2715 goto out;
2575 2716
2576 len = page->private >> 2; 2717 len = page->private >> 2;
2577 eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS); 2718 eb = find_extent_buffer(io_tree, bytenr, len);
2578 if (!eb) 2719 if (!eb)
2579 goto out; 2720 goto out;
2580 2721
@@ -2597,6 +2738,355 @@ out:
2597 return 0; 2738 return 0;
2598} 2739}
2599 2740
2741static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
2742 int read_only)
2743{
2744 if (read_only)
2745 return;
2746
2747 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2748 printk(KERN_WARNING "warning: mount fs with errors, "
2749 "running btrfsck is recommended\n");
2750}
2751
2752int btrfs_error_commit_super(struct btrfs_root *root)
2753{
2754 int ret;
2755
2756 mutex_lock(&root->fs_info->cleaner_mutex);
2757 btrfs_run_delayed_iputs(root);
2758 mutex_unlock(&root->fs_info->cleaner_mutex);
2759
2760 down_write(&root->fs_info->cleanup_work_sem);
2761 up_write(&root->fs_info->cleanup_work_sem);
2762
2763 /* cleanup FS via transaction */
2764 btrfs_cleanup_transaction(root);
2765
2766 ret = write_ctree_super(NULL, root, 0);
2767
2768 return ret;
2769}
2770
2771static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
2772{
2773 struct btrfs_inode *btrfs_inode;
2774 struct list_head splice;
2775
2776 INIT_LIST_HEAD(&splice);
2777
2778 mutex_lock(&root->fs_info->ordered_operations_mutex);
2779 spin_lock(&root->fs_info->ordered_extent_lock);
2780
2781 list_splice_init(&root->fs_info->ordered_operations, &splice);
2782 while (!list_empty(&splice)) {
2783 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2784 ordered_operations);
2785
2786 list_del_init(&btrfs_inode->ordered_operations);
2787
2788 btrfs_invalidate_inodes(btrfs_inode->root);
2789 }
2790
2791 spin_unlock(&root->fs_info->ordered_extent_lock);
2792 mutex_unlock(&root->fs_info->ordered_operations_mutex);
2793
2794 return 0;
2795}
2796
2797static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
2798{
2799 struct list_head splice;
2800 struct btrfs_ordered_extent *ordered;
2801 struct inode *inode;
2802
2803 INIT_LIST_HEAD(&splice);
2804
2805 spin_lock(&root->fs_info->ordered_extent_lock);
2806
2807 list_splice_init(&root->fs_info->ordered_extents, &splice);
2808 while (!list_empty(&splice)) {
2809 ordered = list_entry(splice.next, struct btrfs_ordered_extent,
2810 root_extent_list);
2811
2812 list_del_init(&ordered->root_extent_list);
2813 atomic_inc(&ordered->refs);
2814
2815 /* the inode may be getting freed (in sys_unlink path). */
2816 inode = igrab(ordered->inode);
2817
2818 spin_unlock(&root->fs_info->ordered_extent_lock);
2819 if (inode)
2820 iput(inode);
2821
2822 atomic_set(&ordered->refs, 1);
2823 btrfs_put_ordered_extent(ordered);
2824
2825 spin_lock(&root->fs_info->ordered_extent_lock);
2826 }
2827
2828 spin_unlock(&root->fs_info->ordered_extent_lock);
2829
2830 return 0;
2831}
2832
2833static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
2834 struct btrfs_root *root)
2835{
2836 struct rb_node *node;
2837 struct btrfs_delayed_ref_root *delayed_refs;
2838 struct btrfs_delayed_ref_node *ref;
2839 int ret = 0;
2840
2841 delayed_refs = &trans->delayed_refs;
2842
2843 spin_lock(&delayed_refs->lock);
2844 if (delayed_refs->num_entries == 0) {
2845 spin_unlock(&delayed_refs->lock);
2846 printk(KERN_INFO "delayed_refs has NO entry\n");
2847 return ret;
2848 }
2849
2850 node = rb_first(&delayed_refs->root);
2851 while (node) {
2852 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2853 node = rb_next(node);
2854
2855 ref->in_tree = 0;
2856 rb_erase(&ref->rb_node, &delayed_refs->root);
2857 delayed_refs->num_entries--;
2858
2859 atomic_set(&ref->refs, 1);
2860 if (btrfs_delayed_ref_is_head(ref)) {
2861 struct btrfs_delayed_ref_head *head;
2862
2863 head = btrfs_delayed_node_to_head(ref);
2864 mutex_lock(&head->mutex);
2865 kfree(head->extent_op);
2866 delayed_refs->num_heads--;
2867 if (list_empty(&head->cluster))
2868 delayed_refs->num_heads_ready--;
2869 list_del_init(&head->cluster);
2870 mutex_unlock(&head->mutex);
2871 }
2872
2873 spin_unlock(&delayed_refs->lock);
2874 btrfs_put_delayed_ref(ref);
2875
2876 cond_resched();
2877 spin_lock(&delayed_refs->lock);
2878 }
2879
2880 spin_unlock(&delayed_refs->lock);
2881
2882 return ret;
2883}
2884
2885static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
2886{
2887 struct btrfs_pending_snapshot *snapshot;
2888 struct list_head splice;
2889
2890 INIT_LIST_HEAD(&splice);
2891
2892 list_splice_init(&t->pending_snapshots, &splice);
2893
2894 while (!list_empty(&splice)) {
2895 snapshot = list_entry(splice.next,
2896 struct btrfs_pending_snapshot,
2897 list);
2898
2899 list_del_init(&snapshot->list);
2900
2901 kfree(snapshot);
2902 }
2903
2904 return 0;
2905}
2906
2907static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
2908{
2909 struct btrfs_inode *btrfs_inode;
2910 struct list_head splice;
2911
2912 INIT_LIST_HEAD(&splice);
2913
2914 spin_lock(&root->fs_info->delalloc_lock);
2915 list_splice_init(&root->fs_info->delalloc_inodes, &splice);
2916
2917 while (!list_empty(&splice)) {
2918 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2919 delalloc_inodes);
2920
2921 list_del_init(&btrfs_inode->delalloc_inodes);
2922
2923 btrfs_invalidate_inodes(btrfs_inode->root);
2924 }
2925
2926 spin_unlock(&root->fs_info->delalloc_lock);
2927
2928 return 0;
2929}
2930
2931static int btrfs_destroy_marked_extents(struct btrfs_root *root,
2932 struct extent_io_tree *dirty_pages,
2933 int mark)
2934{
2935 int ret;
2936 struct page *page;
2937 struct inode *btree_inode = root->fs_info->btree_inode;
2938 struct extent_buffer *eb;
2939 u64 start = 0;
2940 u64 end;
2941 u64 offset;
2942 unsigned long index;
2943
2944 while (1) {
2945 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
2946 mark);
2947 if (ret)
2948 break;
2949
2950 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
2951 while (start <= end) {
2952 index = start >> PAGE_CACHE_SHIFT;
2953 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
2954 page = find_get_page(btree_inode->i_mapping, index);
2955 if (!page)
2956 continue;
2957 offset = page_offset(page);
2958
2959 spin_lock(&dirty_pages->buffer_lock);
2960 eb = radix_tree_lookup(
2961 &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
2962 offset >> PAGE_CACHE_SHIFT);
2963 spin_unlock(&dirty_pages->buffer_lock);
2964 if (eb) {
2965 ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
2966 &eb->bflags);
2967 atomic_set(&eb->refs, 1);
2968 }
2969 if (PageWriteback(page))
2970 end_page_writeback(page);
2971
2972 lock_page(page);
2973 if (PageDirty(page)) {
2974 clear_page_dirty_for_io(page);
2975 spin_lock_irq(&page->mapping->tree_lock);
2976 radix_tree_tag_clear(&page->mapping->page_tree,
2977 page_index(page),
2978 PAGECACHE_TAG_DIRTY);
2979 spin_unlock_irq(&page->mapping->tree_lock);
2980 }
2981
2982 page->mapping->a_ops->invalidatepage(page, 0);
2983 unlock_page(page);
2984 }
2985 }
2986
2987 return ret;
2988}
2989
2990static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
2991 struct extent_io_tree *pinned_extents)
2992{
2993 struct extent_io_tree *unpin;
2994 u64 start;
2995 u64 end;
2996 int ret;
2997
2998 unpin = pinned_extents;
2999 while (1) {
3000 ret = find_first_extent_bit(unpin, 0, &start, &end,
3001 EXTENT_DIRTY);
3002 if (ret)
3003 break;
3004
3005 /* opt_discard */
3006 if (btrfs_test_opt(root, DISCARD))
3007 ret = btrfs_error_discard_extent(root, start,
3008 end + 1 - start,
3009 NULL);
3010
3011 clear_extent_dirty(unpin, start, end, GFP_NOFS);
3012 btrfs_error_unpin_extent_range(root, start, end);
3013 cond_resched();
3014 }
3015
3016 return 0;
3017}
3018
3019static int btrfs_cleanup_transaction(struct btrfs_root *root)
3020{
3021 struct btrfs_transaction *t;
3022 LIST_HEAD(list);
3023
3024 WARN_ON(1);
3025
3026 mutex_lock(&root->fs_info->transaction_kthread_mutex);
3027
3028 spin_lock(&root->fs_info->trans_lock);
3029 list_splice_init(&root->fs_info->trans_list, &list);
3030 root->fs_info->trans_no_join = 1;
3031 spin_unlock(&root->fs_info->trans_lock);
3032
3033 while (!list_empty(&list)) {
3034 t = list_entry(list.next, struct btrfs_transaction, list);
3035 if (!t)
3036 break;
3037
3038 btrfs_destroy_ordered_operations(root);
3039
3040 btrfs_destroy_ordered_extents(root);
3041
3042 btrfs_destroy_delayed_refs(t, root);
3043
3044 btrfs_block_rsv_release(root,
3045 &root->fs_info->trans_block_rsv,
3046 t->dirty_pages.dirty_bytes);
3047
3048 /* FIXME: cleanup wait for commit */
3049 t->in_commit = 1;
3050 t->blocked = 1;
3051 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
3052 wake_up(&root->fs_info->transaction_blocked_wait);
3053
3054 t->blocked = 0;
3055 if (waitqueue_active(&root->fs_info->transaction_wait))
3056 wake_up(&root->fs_info->transaction_wait);
3057
3058 t->commit_done = 1;
3059 if (waitqueue_active(&t->commit_wait))
3060 wake_up(&t->commit_wait);
3061
3062 btrfs_destroy_pending_snapshots(t);
3063
3064 btrfs_destroy_delalloc_inodes(root);
3065
3066 spin_lock(&root->fs_info->trans_lock);
3067 root->fs_info->running_transaction = NULL;
3068 spin_unlock(&root->fs_info->trans_lock);
3069
3070 btrfs_destroy_marked_extents(root, &t->dirty_pages,
3071 EXTENT_DIRTY);
3072
3073 btrfs_destroy_pinned_extent(root,
3074 root->fs_info->pinned_extents);
3075
3076 atomic_set(&t->use_count, 0);
3077 list_del_init(&t->list);
3078 memset(t, 0, sizeof(*t));
3079 kmem_cache_free(btrfs_transaction_cachep, t);
3080 }
3081
3082 spin_lock(&root->fs_info->trans_lock);
3083 root->fs_info->trans_no_join = 0;
3084 spin_unlock(&root->fs_info->trans_lock);
3085 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3086
3087 return 0;
3088}
3089
2600static struct extent_io_ops btree_extent_io_ops = { 3090static struct extent_io_ops btree_extent_io_ops = {
2601 .write_cache_pages_lock_hook = btree_lock_page_hook, 3091 .write_cache_pages_lock_hook = btree_lock_page_hook,
2602 .readpage_end_io_hook = btree_readpage_end_io_hook, 3092 .readpage_end_io_hook = btree_readpage_end_io_hook,