Diffstat (limited to 'fs/btrfs/disk-io.c')
 fs/btrfs/disk-io.c | 640 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 545 insertions(+), 95 deletions(-)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 51d2e4de34eb..68c84c8c24bd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,6 +29,7 @@
 #include <linux/crc32c.h>
 #include <linux/slab.h>
 #include <linux/migrate.h>
+#include <asm/unaligned.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -44,6 +45,20 @@
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+				    int read_only);
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+				      struct btrfs_root *root);
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages,
+					int mark);
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+					struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_root *root);
 
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
@@ -184,7 +199,7 @@ u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
 
 void btrfs_csum_final(u32 crc, char *result)
 {
-	*(__le32 *)result = ~cpu_to_le32(crc);
+	put_unaligned_le32(~crc, result);
 }
 
 /*
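
The old code stored the final CRC by casting the result buffer to a __le32 pointer, a 4-byte store that can fault (or be silently fixed up at great cost) on architectures that do not tolerate unaligned accesses. put_unaligned_le32() from the newly included <asm/unaligned.h> writes the same little-endian value safely at any address. A standalone sketch of the byte-wise store the helper boils down to (an illustration, not the kernel implementation):

#include <stdint.h>
#include <stdio.h>

/* Store a 32-bit value little-endian at any address, one byte at a
 * time, so no unaligned 32-bit store is ever issued. */
static void put_unaligned_le32_sketch(uint32_t val, void *p)
{
	uint8_t *b = p;

	b[0] = val & 0xff;
	b[1] = (val >> 8) & 0xff;
	b[2] = (val >> 16) & 0xff;
	b[3] = (val >> 24) & 0xff;
}

int main(void)
{
	unsigned char buf[7];

	/* Deliberately misaligned destination: buf + 1. */
	put_unaligned_le32_sketch(~0xdeadbeefu, buf + 1);
	printf("%02x %02x %02x %02x\n", buf[1], buf[2], buf[3], buf[4]);
	return 0;
}
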
@@ -309,6 +324,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 	int num_copies = 0;
 	int mirror_num = 0;
 
+	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 	while (1) {
 		ret = read_extent_buffer_pages(io_tree, eb, start, 1,
@@ -317,6 +333,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 		    !verify_parent_transid(io_tree, eb, parent_transid))
 			return ret;
 
+		/*
+		 * This buffer's crc is fine, but its contents are corrupted, so
+		 * there is no reason to read the other copies, they won't be
+		 * any less wrong.
+		 */
+		if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+			return ret;
+
 		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 					      eb->start, eb->len);
 		if (num_copies == 1)
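
The retry loop above walks the available mirrors only while a read looks like a transport failure. Once EXTENT_BUFFER_CORRUPT is set, the checksum matched but the contents failed validation, so the block was written corrupted and every mirror holds the same bytes; retrying cannot help. A compressed sketch of that policy (the types and helper names here are hypothetical, only the control flow mirrors the patch):

#include <stdio.h>

enum read_status { READ_OK, READ_IO_ERROR, READ_CORRUPT };

/* Try each mirror on I/O errors, but stop as soon as the failure is
 * in the content itself, since every copy decodes to the same bytes. */
static int read_block_retrying(int num_copies,
			       enum read_status (*read_mirror)(int mirror))
{
	for (int mirror = 0; mirror < num_copies; mirror++) {
		switch (read_mirror(mirror)) {
		case READ_OK:
			return 0;
		case READ_CORRUPT:
			return -1;	/* csum fine, contents not: give up */
		case READ_IO_ERROR:
			continue;	/* media error: try the next copy */
		}
	}
	return -1;
}

/* Demo: mirror 0 fails at the transport level, mirror 1 succeeds. */
static enum read_status fake_mirror(int mirror)
{
	return mirror == 0 ? READ_IO_ERROR : READ_OK;
}

int main(void)
{
	printf("read %s\n",
	       read_block_retrying(2, fake_mirror) ? "failed" : "ok");
	return 0;
}
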
@@ -345,14 +369,22 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 
-	if (page->private == EXTENT_PAGE_PRIVATE)
+	if (page->private == EXTENT_PAGE_PRIVATE) {
+		WARN_ON(1);
 		goto out;
-	if (!page->private)
+	}
+	if (!page->private) {
+		WARN_ON(1);
 		goto out;
+	}
 	len = page->private >> 2;
 	WARN_ON(len == 0);
 
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+	if (eb == NULL) {
+		WARN_ON(1);
+		goto out;
+	}
 	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
 					     btrfs_header_generation(eb));
 	BUG_ON(ret);
@@ -397,6 +429,73 @@ static int check_tree_block_fsid(struct btrfs_root *root,
 	return ret;
 }
 
+#define CORRUPT(reason, eb, root, slot)				\
+	printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu,"	\
+	       "root=%llu, slot=%d\n", reason,			\
+	       (unsigned long long)btrfs_header_bytenr(eb),	\
+	       (unsigned long long)root->objectid, slot)
+
+static noinline int check_leaf(struct btrfs_root *root,
+			       struct extent_buffer *leaf)
+{
+	struct btrfs_key key;
+	struct btrfs_key leaf_key;
+	u32 nritems = btrfs_header_nritems(leaf);
+	int slot;
+
+	if (nritems == 0)
+		return 0;
+
+	/* Check the 0 item */
+	if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
+	    BTRFS_LEAF_DATA_SIZE(root)) {
+		CORRUPT("invalid item offset size pair", leaf, root, 0);
+		return -EIO;
+	}
+
+	/*
+	 * Check to make sure each item's keys are in the correct order and
+	 * their offsets make sense.  We only have to loop through nritems-1
+	 * because we check the current slot against the next slot, which
+	 * verifies the next slot's offset+size makes sense and that the
+	 * current slot's offset is correct.
+	 */
+	for (slot = 0; slot < nritems - 1; slot++) {
+		btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
+		btrfs_item_key_to_cpu(leaf, &key, slot + 1);
+
+		/* Make sure the keys are in the right order */
+		if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
+			CORRUPT("bad key order", leaf, root, slot);
+			return -EIO;
+		}
+
+		/*
+		 * Make sure the offset and ends are right; remember that the
+		 * item data starts at the end of the leaf and grows towards
+		 * the front.
+		 */
+		if (btrfs_item_offset_nr(leaf, slot) !=
+			btrfs_item_end_nr(leaf, slot + 1)) {
+			CORRUPT("slot offset bad", leaf, root, slot);
+			return -EIO;
+		}
+
+		/*
+		 * Check to make sure that we don't point outside of the leaf,
+		 * just in case all the items are consistent with each other
+		 * but all point outside of the leaf.
+		 */
+		if (btrfs_item_end_nr(leaf, slot) >
+		    BTRFS_LEAF_DATA_SIZE(root)) {
+			CORRUPT("slot end outside of leaf", leaf, root, slot);
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
 {
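
check_leaf() leans on the leaf layout: item headers grow forward from the start of the block while item data grows backward from the end, so slot 0's data must end exactly at BTRFS_LEAF_DATA_SIZE, and each slot's data must begin exactly where the next slot's data ends. A self-contained sketch of those offset invariants over a simplified item table (the struct is a stand-in, not the btrfs on-disk format, and the key-ordering check is omitted for brevity):

#include <stdio.h>

#define LEAF_DATA_SIZE 4096	/* illustrative payload size, not btrfs's */

struct item {			/* simplified stand-in for struct btrfs_item */
	unsigned int offset;	/* where this item's data starts */
	unsigned int size;	/* how long this item's data is */
};

/* Return 0 if the table is consistent, -1 on the first violation. */
static int check_leaf_sketch(const struct item *items, int nritems)
{
	if (nritems == 0)
		return 0;

	/* Slot 0's data must end exactly at the end of the block. */
	if (items[0].offset + items[0].size != LEAF_DATA_SIZE)
		return -1;

	for (int slot = 0; slot < nritems - 1; slot++) {
		/* Data is packed back to front: this slot's data must
		 * begin where the next slot's data ends. */
		if (items[slot].offset !=
		    items[slot + 1].offset + items[slot + 1].size)
			return -1;
		/* And no item may point past the block payload. */
		if (items[slot].offset + items[slot].size > LEAF_DATA_SIZE)
			return -1;
	}
	return 0;
}

int main(void)
{
	/* Two items: 100 bytes at the very end, 50 bytes before them. */
	struct item good[] = { { 3996, 100 }, { 3946, 50 } };
	struct item bad[]  = { { 3996, 100 }, { 3900, 50 } };	/* gap */

	printf("good: %d\n", check_leaf_sketch(good, 2));	/* 0 */
	printf("bad:  %d\n", check_leaf_sketch(bad, 2));	/* -1 */
	return 0;
}
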
@@ -427,6 +526,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	WARN_ON(len == 0);
 
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+	if (eb == NULL) {
+		ret = -EIO;
+		goto out;
+	}
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
@@ -459,8 +562,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	btrfs_set_buffer_lockdep_class(eb, found_level);
 
 	ret = csum_tree_block(root, eb, 1);
-	if (ret)
+	if (ret) {
+		ret = -EIO;
+		goto err;
+	}
+
+	/*
+	 * If this is a leaf block and it is corrupt, set the corrupt bit so
+	 * that we don't try and read the other copies of this block, just
+	 * return -EIO.
+	 */
+	if (found_level == 0 && check_leaf(root, eb)) {
+		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 		ret = -EIO;
+	}
 
 	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
 	end = eb->start + end - 1;
@@ -821,7 +936,6 @@ static const struct address_space_operations btree_aops = {
 	.writepages	= btree_writepages,
 	.releasepage	= btree_releasepage,
 	.invalidatepage = btree_invalidatepage,
-	.sync_page	= block_sync_page,
 #ifdef CONFIG_MIGRATION
 	.migratepage	= btree_migratepage,
 #endif
@@ -1134,7 +1248,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 					 root, fs_info, location->objectid);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path) {
+		kfree(root);
+		return ERR_PTR(-ENOMEM);
+	}
 	ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
 	if (ret == 0) {
 		l = path->nodes[0];
@@ -1145,6 +1262,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	}
 	btrfs_free_path(path);
 	if (ret) {
+		kfree(root);
 		if (ret > 0)
 			ret = -ENOENT;
 		return ERR_PTR(ret);
@@ -1157,8 +1275,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	root->commit_root = btrfs_root_node(root);
 	BUG_ON(!root->node);
 out:
-	if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
+	if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
 		root->ref_cows = 1;
+		btrfs_check_and_init_root_item(&root->root_item);
+	}
 
 	return root;
 }
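
Both fixes above follow the kernel's ERR_PTR convention: on failure the function frees whatever it allocated, then returns the errno encoded as a pointer, and callers test with IS_ERR()/PTR_ERR() instead of comparing against NULL. A userspace re-creation of the idiom (the helpers mimic include/linux/err.h but are written out here as an illustration):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

/* Errno values are small, so -errno cast to a pointer lands in the
 * top page of the address space, which no valid allocation occupies. */
#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Toy allocator: frees its allocation on every error path, then
 * hands back the errno encoded as a pointer. */
static void *read_root(int fail)
{
	void *root = malloc(64);

	if (!root)
		return ERR_PTR(-ENOMEM);
	if (fail) {
		free(root);
		return ERR_PTR(-ENOENT);
	}
	return root;
}

int main(void)
{
	void *root = read_root(1);

	if (IS_ERR(root)) {
		printf("read_root failed: %ld\n", PTR_ERR(root));
		return 1;
	}
	free(root);
	return 0;
}
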
@@ -1304,82 +1424,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 }
 
 /*
- * this unplugs every device on the box, and it is only used when page
- * is null
- */
-static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-	struct btrfs_device *device;
-	struct btrfs_fs_info *info;
-
-	info = (struct btrfs_fs_info *)bdi->unplug_io_data;
-	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
-		if (!device->bdev)
-			continue;
-
-		bdi = blk_get_backing_dev_info(device->bdev);
-		if (bdi->unplug_io_fn)
-			bdi->unplug_io_fn(bdi, page);
-	}
-}
-
-static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-	struct inode *inode;
-	struct extent_map_tree *em_tree;
-	struct extent_map *em;
-	struct address_space *mapping;
-	u64 offset;
-
-	/* the generic O_DIRECT read code does this */
-	if (1 || !page) {
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-
-	/*
-	 * page->mapping may change at any time.  Get a consistent copy
-	 * and use that for everything below
-	 */
-	smp_mb();
-	mapping = page->mapping;
-	if (!mapping)
-		return;
-
-	inode = mapping->host;
-
-	/*
-	 * don't do the expensive searching for a small number of
-	 * devices
-	 */
-	if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-
-	offset = page_offset(page);
-
-	em_tree = &BTRFS_I(inode)->extent_tree;
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
-	read_unlock(&em_tree->lock);
-	if (!em) {
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-
-	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-		free_extent_map(em);
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-	offset = offset - em->start;
-	btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
-			  em->block_start + offset, page);
-	free_extent_map(em);
-}
-
-/*
  * If this fails, caller must call bdi_destroy() to get rid of the
  * bdi again.
  */
@@ -1393,8 +1437,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 		return err;
 
 	bdi->ra_pages	= default_backing_dev_info.ra_pages;
-	bdi->unplug_io_fn	= btrfs_unplug_io_fn;
-	bdi->unplug_io_data	= info;
 	bdi->congested_fn	= btrfs_congested_fn;
 	bdi->congested_data	= info;
 	return 0;
@@ -1527,6 +1569,7 @@ static int transaction_kthread(void *arg)
 	spin_unlock(&root->fs_info->new_trans_lock);
 
 	trans = btrfs_join_transaction(root, 1);
+	BUG_ON(IS_ERR(trans));
 	if (transid == trans->transid) {
 		ret = btrfs_commit_transaction(trans, root);
 		BUG_ON(ret);
@@ -1604,6 +1647,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_bdi;
 	}
 
+	fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
+
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
@@ -1713,8 +1758,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					   fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
 	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
-	if (!bh)
+	if (!bh) {
+		err = -EINVAL;
 		goto fail_iput;
+	}
 
 	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
 	memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1727,6 +1774,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!btrfs_super_root(disk_super))
 		goto fail_iput;
 
+	/* check FS state, whether FS is broken. */
+	fs_info->fs_state |= btrfs_super_flags(disk_super);
+
+	btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+
+	/*
+	 * In the long term, we'll store the compression type in the super
+	 * block, and it'll be used for per file compression control.
+	 */
+	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
 	ret = btrfs_parse_options(tree_root, options);
 	if (ret) {
 		err = ret;
@@ -1744,10 +1802,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	features = btrfs_super_incompat_flags(disk_super);
-	if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
-		features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-		btrfs_set_super_incompat_flags(disk_super, features);
-	}
+	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+	if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+	btrfs_set_super_incompat_flags(disk_super, features);
 
 	features = btrfs_super_compat_ro_flags(disk_super) &
 		~BTRFS_FEATURE_COMPAT_RO_SUPP;
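
Setting BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO here is what locks out older kernels: incompat bits are the on-disk format's version gate, and a module that sees a set bit it does not recognize must refuse the mount (the compat_ro mask just below implements the read-only variant of the same idea). A sketch of that contract with made-up flag values (the names and bit positions are illustrative, not the real btrfs constants):

#include <stdint.h>
#include <stdio.h>

#define FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
#define FEATURE_INCOMPAT_COMPRESS_LZO	(1ULL << 1)

/* Everything this (hypothetical) module knows how to handle. */
#define FEATURE_INCOMPAT_SUPP \
	(FEATURE_INCOMPAT_MIXED_BACKREF | FEATURE_INCOMPAT_COMPRESS_LZO)

/* Refuse to mount when the super block carries any unknown bit. */
static int check_incompat(uint64_t disk_flags)
{
	uint64_t unsupported = disk_flags & ~FEATURE_INCOMPAT_SUPP;

	if (unsupported) {
		fprintf(stderr,
			"unsupported incompat features 0x%llx, refusing\n",
			(unsigned long long)unsupported);
		return -1;
	}
	return 0;
}

int main(void)
{
	/* A super block written by a newer kernel with an unknown bit. */
	uint64_t flags = FEATURE_INCOMPAT_MIXED_BACKREF | (1ULL << 7);

	return check_incompat(flags) ? 1 : 0;
}
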
@@ -1932,6 +1990,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->metadata_alloc_profile = (u64)-1;
 	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 
+	ret = btrfs_init_space_info(fs_info);
+	if (ret) {
+		printk(KERN_ERR "Failed to initial space info: %d\n", ret);
+		goto fail_block_groups;
+	}
+
 	ret = btrfs_read_block_groups(extent_root);
 	if (ret) {
 		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
@@ -1957,7 +2021,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		btrfs_set_opt(fs_info->mount_opt, SSD);
 	}
 
-	if (btrfs_super_log_root(disk_super) != 0) {
+	/* do not make disk changes in broken FS */
+	if (btrfs_super_log_root(disk_super) != 0 &&
+	    !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
 		u64 bytenr = btrfs_super_log_root(disk_super);
 
 		if (fs_devices->rw_devices == 0) {
@@ -2021,9 +2087,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_read(&fs_info->cleanup_work_sem);
-		btrfs_orphan_cleanup(fs_info->fs_root);
-		btrfs_orphan_cleanup(fs_info->tree_root);
+		err = btrfs_orphan_cleanup(fs_info->fs_root);
+		if (!err)
+			err = btrfs_orphan_cleanup(fs_info->tree_root);
 		up_read(&fs_info->cleanup_work_sem);
+		if (err) {
+			close_ctree(tree_root);
+			return ERR_PTR(err);
+		}
 	}
 
 	return tree_root;
@@ -2398,8 +2469,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
 
 		root_objectid = gang[ret - 1]->root_key.objectid + 1;
 		for (i = 0; i < ret; i++) {
+			int err;
+
 			root_objectid = gang[i]->root_key.objectid;
-			btrfs_orphan_cleanup(gang[i]);
+			err = btrfs_orphan_cleanup(gang[i]);
+			if (err)
+				return err;
 		}
 		root_objectid++;
 	}
@@ -2421,10 +2496,14 @@ int btrfs_commit_super(struct btrfs_root *root)
 	up_write(&root->fs_info->cleanup_work_sem);
 
 	trans = btrfs_join_transaction(root, 1);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	ret = btrfs_commit_transaction(trans, root);
 	BUG_ON(ret);
 	/* run commit again to drop the original snapshot */
 	trans = btrfs_join_transaction(root, 1);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_commit_transaction(trans, root);
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
@@ -2442,8 +2521,28 @@ int close_ctree(struct btrfs_root *root)
 	smp_mb();
 
 	btrfs_put_block_group_cache(fs_info);
+
+	/*
+	 * There are two situations in which a broken btrfs gets flipped
+	 * readonly:
+	 *
+	 * 1. btrfs flips readonly somewhere else before
+	 *    btrfs_commit_super: sb->s_flags has the MS_RDONLY flag,
+	 *    and btrfs skips writing the sb directly, to keep the
+	 *    ERROR state on disk.
+	 *
+	 * 2. btrfs flips readonly inside btrfs_commit_super itself: in
+	 *    that case btrfs cannot write the sb via btrfs_commit_super,
+	 *    and since fs_state has the BTRFS_SUPER_FLAG_ERROR flag set,
+	 *    btrfs cleans up all FS resources first and writes the sb
+	 *    afterwards.
+	 */
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_commit_super(root);
+		if (ret)
+			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+	}
+
+	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+		ret = btrfs_error_commit_super(root);
 		if (ret)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
@@ -2502,6 +2601,8 @@ int close_ctree(struct btrfs_root *root)
 	kfree(fs_info->chunk_root);
 	kfree(fs_info->dev_root);
 	kfree(fs_info->csum_root);
+	kfree(fs_info);
+
 	return 0;
 }
 
@@ -2619,6 +2720,355 @@ out:
 	return 0;
 }
 
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+				    int read_only)
+{
+	if (read_only)
+		return;
+
+	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+		printk(KERN_WARNING "warning: mount fs with errors, "
+		       "running btrfsck is recommended\n");
+}
+
+int btrfs_error_commit_super(struct btrfs_root *root)
+{
+	int ret;
+
+	mutex_lock(&root->fs_info->cleaner_mutex);
+	btrfs_run_delayed_iputs(root);
+	mutex_unlock(&root->fs_info->cleaner_mutex);
+
+	down_write(&root->fs_info->cleanup_work_sem);
+	up_write(&root->fs_info->cleanup_work_sem);
+
+	/* cleanup FS via transaction */
+	btrfs_cleanup_transaction(root);
+
+	ret = write_ctree_super(NULL, root, 0);
+
+	return ret;
+}
+
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+{
+	struct btrfs_inode *btrfs_inode;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	mutex_lock(&root->fs_info->ordered_operations_mutex);
+	spin_lock(&root->fs_info->ordered_extent_lock);
+
+	list_splice_init(&root->fs_info->ordered_operations, &splice);
+	while (!list_empty(&splice)) {
+		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+					 ordered_operations);
+
+		list_del_init(&btrfs_inode->ordered_operations);
+
+		btrfs_invalidate_inodes(btrfs_inode->root);
+	}
+
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+	return 0;
+}
+
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+{
+	struct list_head splice;
+	struct btrfs_ordered_extent *ordered;
+	struct inode *inode;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&root->fs_info->ordered_extent_lock);
+
+	list_splice_init(&root->fs_info->ordered_extents, &splice);
+	while (!list_empty(&splice)) {
+		ordered = list_entry(splice.next, struct btrfs_ordered_extent,
+				     root_extent_list);
+
+		list_del_init(&ordered->root_extent_list);
+		atomic_inc(&ordered->refs);
+
+		/* the inode may be getting freed (in sys_unlink path). */
+		inode = igrab(ordered->inode);
+
+		spin_unlock(&root->fs_info->ordered_extent_lock);
+		if (inode)
+			iput(inode);
+
+		atomic_set(&ordered->refs, 1);
+		btrfs_put_ordered_extent(ordered);
+
+		spin_lock(&root->fs_info->ordered_extent_lock);
+	}
+
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+
+	return 0;
+}
+
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+				      struct btrfs_root *root)
+{
+	struct rb_node *node;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_node *ref;
+	int ret = 0;
+
+	delayed_refs = &trans->delayed_refs;
+
+	spin_lock(&delayed_refs->lock);
+	if (delayed_refs->num_entries == 0) {
+		spin_unlock(&delayed_refs->lock);
+		printk(KERN_INFO "delayed_refs has NO entry\n");
+		return ret;
+	}
+
+	node = rb_first(&delayed_refs->root);
+	while (node) {
+		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		node = rb_next(node);
+
+		ref->in_tree = 0;
+		rb_erase(&ref->rb_node, &delayed_refs->root);
+		delayed_refs->num_entries--;
+
+		atomic_set(&ref->refs, 1);
+		if (btrfs_delayed_ref_is_head(ref)) {
+			struct btrfs_delayed_ref_head *head;
+
+			head = btrfs_delayed_node_to_head(ref);
+			mutex_lock(&head->mutex);
+			kfree(head->extent_op);
+			delayed_refs->num_heads--;
+			if (list_empty(&head->cluster))
+				delayed_refs->num_heads_ready--;
+			list_del_init(&head->cluster);
+			mutex_unlock(&head->mutex);
+		}
+
+		spin_unlock(&delayed_refs->lock);
+		btrfs_put_delayed_ref(ref);
+
+		cond_resched();
+		spin_lock(&delayed_refs->lock);
+	}
+
+	spin_unlock(&delayed_refs->lock);
+
+	return ret;
+}
+
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+{
+	struct btrfs_pending_snapshot *snapshot;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	list_splice_init(&t->pending_snapshots, &splice);
+
+	while (!list_empty(&splice)) {
+		snapshot = list_entry(splice.next,
+				      struct btrfs_pending_snapshot,
+				      list);
+
+		list_del_init(&snapshot->list);
+
+		kfree(snapshot);
+	}
+
+	return 0;
+}
+
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+{
+	struct btrfs_inode *btrfs_inode;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&root->fs_info->delalloc_lock);
+	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+
+	while (!list_empty(&splice)) {
+		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+					 delalloc_inodes);
+
+		list_del_init(&btrfs_inode->delalloc_inodes);
+
+		btrfs_invalidate_inodes(btrfs_inode->root);
+	}
+
+	spin_unlock(&root->fs_info->delalloc_lock);
+
+	return 0;
+}
+
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages,
+					int mark)
+{
+	int ret;
+	struct page *page;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_buffer *eb;
+	u64 start = 0;
+	u64 end;
+	u64 offset;
+	unsigned long index;
+
+	while (1) {
+		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+					    mark);
+		if (ret)
+			break;
+
+		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+		while (start <= end) {
+			index = start >> PAGE_CACHE_SHIFT;
+			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+			page = find_get_page(btree_inode->i_mapping, index);
+			if (!page)
+				continue;
+			offset = page_offset(page);
+
+			spin_lock(&dirty_pages->buffer_lock);
+			eb = radix_tree_lookup(
+			     &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
+			     offset >> PAGE_CACHE_SHIFT);
+			spin_unlock(&dirty_pages->buffer_lock);
+			if (eb) {
+				ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+							 &eb->bflags);
+				atomic_set(&eb->refs, 1);
+			}
+			if (PageWriteback(page))
+				end_page_writeback(page);
+
+			lock_page(page);
+			if (PageDirty(page)) {
+				clear_page_dirty_for_io(page);
+				spin_lock_irq(&page->mapping->tree_lock);
+				radix_tree_tag_clear(&page->mapping->page_tree,
+						     page_index(page),
+						     PAGECACHE_TAG_DIRTY);
+				spin_unlock_irq(&page->mapping->tree_lock);
+			}
+
+			page->mapping->a_ops->invalidatepage(page, 0);
+			unlock_page(page);
+		}
+	}
+
+	return ret;
+}
+
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+				       struct extent_io_tree *pinned_extents)
+{
+	struct extent_io_tree *unpin;
+	u64 start;
+	u64 end;
+	int ret;
+
+	unpin = pinned_extents;
+	while (1) {
+		ret = find_first_extent_bit(unpin, 0, &start, &end,
+					    EXTENT_DIRTY);
+		if (ret)
+			break;
+
+		/* opt_discard */
+		if (btrfs_test_opt(root, DISCARD))
+			ret = btrfs_error_discard_extent(root, start,
+							 end + 1 - start,
+							 NULL);
+
+		clear_extent_dirty(unpin, start, end, GFP_NOFS);
+		btrfs_error_unpin_extent_range(root, start, end);
+		cond_resched();
+	}
+
+	return 0;
+}
+
+static int btrfs_cleanup_transaction(struct btrfs_root *root)
+{
+	struct btrfs_transaction *t;
+	LIST_HEAD(list);
+
+	WARN_ON(1);
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+	list_splice_init(&root->fs_info->trans_list, &list);
+	while (!list_empty(&list)) {
+		t = list_entry(list.next, struct btrfs_transaction, list);
+		if (!t)
+			break;
+
+		btrfs_destroy_ordered_operations(root);
+
+		btrfs_destroy_ordered_extents(root);
+
+		btrfs_destroy_delayed_refs(t, root);
+
+		btrfs_block_rsv_release(root,
+					&root->fs_info->trans_block_rsv,
+					t->dirty_pages.dirty_bytes);
+
+		/* FIXME: cleanup wait for commit */
+		t->in_commit = 1;
+		t->blocked = 1;
+		if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
+			wake_up(&root->fs_info->transaction_blocked_wait);
+
+		t->blocked = 0;
+		if (waitqueue_active(&root->fs_info->transaction_wait))
+			wake_up(&root->fs_info->transaction_wait);
+		mutex_unlock(&root->fs_info->trans_mutex);
+
+		mutex_lock(&root->fs_info->trans_mutex);
+		t->commit_done = 1;
+		if (waitqueue_active(&t->commit_wait))
+			wake_up(&t->commit_wait);
+		mutex_unlock(&root->fs_info->trans_mutex);
+
+		mutex_lock(&root->fs_info->trans_mutex);
+
+		btrfs_destroy_pending_snapshots(t);
+
+		btrfs_destroy_delalloc_inodes(root);
+
+		spin_lock(&root->fs_info->new_trans_lock);
+		root->fs_info->running_transaction = NULL;
+		spin_unlock(&root->fs_info->new_trans_lock);
+
+		btrfs_destroy_marked_extents(root, &t->dirty_pages,
+					     EXTENT_DIRTY);
+
+		btrfs_destroy_pinned_extent(root,
+					    root->fs_info->pinned_extents);
+
+		atomic_set(&t->use_count, 0);
+		list_del_init(&t->list);
+		memset(t, 0, sizeof(*t));
+		kmem_cache_free(btrfs_transaction_cachep, t);
+	}
+
+	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	return 0;
+}
+
 static struct extent_io_ops btree_extent_io_ops = {
 	.write_cache_pages_lock_hook = btree_lock_page_hook,
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
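
A pattern worth noting in the destroy helpers above: each one splices the shared list onto a private head while holding the relevant lock, then drains the private copy, re-acquiring the lock only around work that touches shared state (as btrfs_destroy_ordered_extents does around igrab). A userspace sketch of that splice-and-drain shape, using a singly linked list and a pthread mutex instead of list_head and a spinlock (names hypothetical):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Minimal intrusive list node, standing in for struct list_head. */
struct node {
	struct node *next;
	int payload;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *shared_list;	/* guarded by list_lock */

/* Splice-and-drain: steal the whole list in O(1) under the lock,
 * then tear the private copy down without holding it. */
static void destroy_all(void)
{
	struct node *splice, *n;

	pthread_mutex_lock(&list_lock);
	splice = shared_list;
	shared_list = NULL;
	pthread_mutex_unlock(&list_lock);

	while (splice) {
		n = splice;
		splice = n->next;
		printf("destroying %d\n", n->payload);
		free(n);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));

		n->payload = i;
		n->next = shared_list;
		shared_list = n;
	}
	destroy_all();
	return 0;
}
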