aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2010-08-06 13:21:20 -0400
committerChris Mason <chris.mason@oracle.com>2012-03-26 16:50:37 -0400
commit727011e07cbdf87772fcc1999cccd15cc915eb62 (patch)
tree05405dc1e9c86d67dbb02ddf063bd0c137ce6707
parent81c9ad237c604adec79fd4d4034264c6669e0ab3 (diff)
Btrfs: allow metadata blocks larger than the page size
A few years ago the btrfs code to support blocks larger than the page size was disabled to fix a few corner cases in the page cache handling. This fixes the code to properly support large metadata blocks again. Since current kernels will crash early and often with larger metadata blocks, this adds an incompat bit so that older kernels can't mount it. This also does away with different blocksizes for nodes and leaves. You get a single block size for all tree blocks. Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r--fs/btrfs/ctree.h24
-rw-r--r--fs/btrfs/disk-io.c196
-rw-r--r--fs/btrfs/extent_io.c144
-rw-r--r--fs/btrfs/extent_io.h12
-rw-r--r--fs/btrfs/inode-item.c1
-rw-r--r--fs/btrfs/volumes.c2
6 files changed, 190 insertions, 189 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index edccc948e877..85ab1c5844a2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -138,6 +138,12 @@ struct btrfs_ordered_sum;
138#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 138#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
139 139
140/* 140/*
141 * the max metadata block size. This limit is somewhat artificial,
142 * but the memmove costs go through the roof for larger blocks.
143 */
144#define BTRFS_MAX_METADATA_BLOCKSIZE 65536
145
146/*
141 * we can actually store much bigger names, but lets not confuse the rest 147 * we can actually store much bigger names, but lets not confuse the rest
142 * of linux 148 * of linux
143 */ 149 */
@@ -461,6 +467,19 @@ struct btrfs_super_block {
461#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) 467#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
462#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) 468#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
463#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) 469#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
470/*
471 * some patches floated around with a second compression method
472 * lets save that incompat here for when they do get in
473 * Note we don't actually support it, we're just reserving the
474 * number
475 */
476#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4)
477
478/*
479 * older kernels tried to do bigger metadata blocks, but the
480 * code was pretty buggy. Lets not let them try anymore.
481 */
482#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
464 483
465#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 484#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
466#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 485#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
@@ -468,6 +487,7 @@ struct btrfs_super_block {
468 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ 487 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
469 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ 488 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
470 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ 489 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
490 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
471 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) 491 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
472 492
473/* 493/*
@@ -1555,14 +1575,14 @@ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
1555#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ 1575#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
1556static inline u##bits btrfs_##name(struct extent_buffer *eb) \ 1576static inline u##bits btrfs_##name(struct extent_buffer *eb) \
1557{ \ 1577{ \
1558 type *p = page_address(eb->first_page); \ 1578 type *p = page_address(eb->pages[0]); \
1559 u##bits res = le##bits##_to_cpu(p->member); \ 1579 u##bits res = le##bits##_to_cpu(p->member); \
1560 return res; \ 1580 return res; \
1561} \ 1581} \
1562static inline void btrfs_set_##name(struct extent_buffer *eb, \ 1582static inline void btrfs_set_##name(struct extent_buffer *eb, \
1563 u##bits val) \ 1583 u##bits val) \
1564{ \ 1584{ \
1565 type *p = page_address(eb->first_page); \ 1585 type *p = page_address(eb->pages[0]); \
1566 p->member = cpu_to_le##bits(val); \ 1586 p->member = cpu_to_le##bits(val); \
1567} 1587}
1568 1588
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 534266fe505f..68fc93e18db8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -370,8 +370,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
370 ret = read_extent_buffer_pages(io_tree, eb, start, 370 ret = read_extent_buffer_pages(io_tree, eb, start,
371 WAIT_COMPLETE, 371 WAIT_COMPLETE,
372 btree_get_extent, mirror_num); 372 btree_get_extent, mirror_num);
373 if (!ret && 373 if (!ret && !verify_parent_transid(io_tree, eb, parent_transid))
374 !verify_parent_transid(io_tree, eb, parent_transid))
375 return ret; 374 return ret;
376 375
377 /* 376 /*
@@ -406,14 +405,11 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
406 u64 found_start; 405 u64 found_start;
407 unsigned long len; 406 unsigned long len;
408 struct extent_buffer *eb; 407 struct extent_buffer *eb;
409 int ret;
410 408
411 tree = &BTRFS_I(page->mapping->host)->io_tree; 409 tree = &BTRFS_I(page->mapping->host)->io_tree;
412 410
413 if (page->private == EXTENT_PAGE_PRIVATE) { 411 if (page->private == EXTENT_PAGE_PRIVATE)
414 WARN_ON(1);
415 goto out; 412 goto out;
416 }
417 if (!page->private) { 413 if (!page->private) {
418 WARN_ON(1); 414 WARN_ON(1);
419 goto out; 415 goto out;
@@ -421,22 +417,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
421 len = page->private >> 2; 417 len = page->private >> 2;
422 WARN_ON(len == 0); 418 WARN_ON(len == 0);
423 419
424 eb = alloc_extent_buffer(tree, start, len, page); 420 eb = find_extent_buffer(tree, start, len);
425 if (eb == NULL) {
426 WARN_ON(1);
427 goto out;
428 }
429 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
430 btrfs_header_generation(eb));
431 BUG_ON(ret);
432 WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
433 421
434 found_start = btrfs_header_bytenr(eb); 422 found_start = btrfs_header_bytenr(eb);
435 if (found_start != start) { 423 if (found_start != start) {
436 WARN_ON(1); 424 WARN_ON(1);
437 goto err; 425 goto err;
438 } 426 }
439 if (eb->first_page != page) { 427 if (eb->pages[0] != page) {
440 WARN_ON(1); 428 WARN_ON(1);
441 goto err; 429 goto err;
442 } 430 }
@@ -537,6 +525,41 @@ static noinline int check_leaf(struct btrfs_root *root,
537 return 0; 525 return 0;
538} 526}
539 527
528struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree,
529 struct page *page, int max_walk)
530{
531 struct extent_buffer *eb;
532 u64 start = page_offset(page);
533 u64 target = start;
534 u64 min_start;
535
536 if (start < max_walk)
537 min_start = 0;
538 else
539 min_start = start - max_walk;
540
541 while (start >= min_start) {
542 eb = find_extent_buffer(tree, start, 0);
543 if (eb) {
544 /*
545 * we found an extent buffer and it contains our page
546 * horray!
547 */
548 if (eb->start <= target &&
549 eb->start + eb->len > target)
550 return eb;
551
552 /* we found an extent buffer that wasn't for us */
553 free_extent_buffer(eb);
554 return NULL;
555 }
556 if (start == 0)
557 break;
558 start -= PAGE_CACHE_SIZE;
559 }
560 return NULL;
561}
562
540static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, 563static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
541 struct extent_state *state) 564 struct extent_state *state)
542{ 565{
@@ -547,24 +570,25 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
547 struct extent_buffer *eb; 570 struct extent_buffer *eb;
548 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 571 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
549 int ret = 0; 572 int ret = 0;
573 int reads_done;
550 574
551 tree = &BTRFS_I(page->mapping->host)->io_tree;
552 if (page->private == EXTENT_PAGE_PRIVATE)
553 goto out;
554 if (!page->private) 575 if (!page->private)
555 goto out; 576 goto out;
556 577
578 tree = &BTRFS_I(page->mapping->host)->io_tree;
557 len = page->private >> 2; 579 len = page->private >> 2;
558 WARN_ON(len == 0);
559 580
560 eb = alloc_extent_buffer(tree, start, len, page); 581 eb = find_eb_for_page(tree, page, max(root->leafsize, root->nodesize));
561 if (eb == NULL) { 582 if (!eb) {
562 ret = -EIO; 583 ret = -EIO;
563 goto out; 584 goto out;
564 } 585 }
586 reads_done = atomic_dec_and_test(&eb->pages_reading);
587 if (!reads_done)
588 goto err;
565 589
566 found_start = btrfs_header_bytenr(eb); 590 found_start = btrfs_header_bytenr(eb);
567 if (found_start != start) { 591 if (found_start != eb->start) {
568 printk_ratelimited(KERN_INFO "btrfs bad tree block start " 592 printk_ratelimited(KERN_INFO "btrfs bad tree block start "
569 "%llu %llu\n", 593 "%llu %llu\n",
570 (unsigned long long)found_start, 594 (unsigned long long)found_start,
@@ -572,13 +596,6 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
572 ret = -EIO; 596 ret = -EIO;
573 goto err; 597 goto err;
574 } 598 }
575 if (eb->first_page != page) {
576 printk(KERN_INFO "btrfs bad first page %lu %lu\n",
577 eb->first_page->index, page->index);
578 WARN_ON(1);
579 ret = -EIO;
580 goto err;
581 }
582 if (check_tree_block_fsid(root, eb)) { 599 if (check_tree_block_fsid(root, eb)) {
583 printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", 600 printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
584 (unsigned long long)eb->start); 601 (unsigned long long)eb->start);
@@ -606,14 +623,14 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
606 ret = -EIO; 623 ret = -EIO;
607 } 624 }
608 625
609 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
610 end = eb->start + end - 1;
611err: 626err:
612 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { 627 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
613 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); 628 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
614 btree_readahead_hook(root, eb, eb->start, ret); 629 btree_readahead_hook(root, eb, eb->start, ret);
615 } 630 }
616 631
632 if (ret && eb)
633 clear_extent_buffer_uptodate(tree, eb, NULL);
617 free_extent_buffer(eb); 634 free_extent_buffer(eb);
618out: 635out:
619 return ret; 636 return ret;
@@ -637,7 +654,7 @@ static int btree_io_failed_hook(struct bio *failed_bio,
637 len = page->private >> 2; 654 len = page->private >> 2;
638 WARN_ON(len == 0); 655 WARN_ON(len == 0);
639 656
640 eb = alloc_extent_buffer(tree, start, len, page); 657 eb = alloc_extent_buffer(tree, start, len);
641 if (eb == NULL) 658 if (eb == NULL)
642 goto out; 659 goto out;
643 660
@@ -896,28 +913,14 @@ static int btree_migratepage(struct address_space *mapping,
896static int btree_writepage(struct page *page, struct writeback_control *wbc) 913static int btree_writepage(struct page *page, struct writeback_control *wbc)
897{ 914{
898 struct extent_io_tree *tree; 915 struct extent_io_tree *tree;
899 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
900 struct extent_buffer *eb;
901 int was_dirty;
902
903 tree = &BTRFS_I(page->mapping->host)->io_tree; 916 tree = &BTRFS_I(page->mapping->host)->io_tree;
917
904 if (!(current->flags & PF_MEMALLOC)) { 918 if (!(current->flags & PF_MEMALLOC)) {
905 return extent_write_full_page(tree, page, 919 return extent_write_full_page(tree, page,
906 btree_get_extent, wbc); 920 btree_get_extent, wbc);
907 } 921 }
908 922
909 redirty_page_for_writepage(wbc, page); 923 redirty_page_for_writepage(wbc, page);
910 eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
911 WARN_ON(!eb);
912
913 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
914 if (!was_dirty) {
915 spin_lock(&root->fs_info->delalloc_lock);
916 root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
917 spin_unlock(&root->fs_info->delalloc_lock);
918 }
919 free_extent_buffer(eb);
920
921 unlock_page(page); 924 unlock_page(page);
922 return 0; 925 return 0;
923} 926}
@@ -954,6 +957,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
954{ 957{
955 struct extent_io_tree *tree; 958 struct extent_io_tree *tree;
956 struct extent_map_tree *map; 959 struct extent_map_tree *map;
960 struct extent_buffer *eb;
961 struct btrfs_root *root;
957 int ret; 962 int ret;
958 963
959 if (PageWriteback(page) || PageDirty(page)) 964 if (PageWriteback(page) || PageDirty(page))
@@ -962,6 +967,13 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
962 tree = &BTRFS_I(page->mapping->host)->io_tree; 967 tree = &BTRFS_I(page->mapping->host)->io_tree;
963 map = &BTRFS_I(page->mapping->host)->extent_tree; 968 map = &BTRFS_I(page->mapping->host)->extent_tree;
964 969
970 root = BTRFS_I(page->mapping->host)->root;
971 if (page->private == EXTENT_PAGE_PRIVATE) {
972 eb = find_eb_for_page(tree, page, max(root->leafsize, root->nodesize));
973 free_extent_buffer(eb);
974 if (eb)
975 return 0;
976 }
965 /* 977 /*
966 * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing 978 * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing
967 * slab allocation from alloc_extent_state down the callchain where 979 * slab allocation from alloc_extent_state down the callchain where
@@ -1074,20 +1086,20 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
1074 struct extent_buffer *eb; 1086 struct extent_buffer *eb;
1075 1087
1076 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, 1088 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
1077 bytenr, blocksize, NULL); 1089 bytenr, blocksize);
1078 return eb; 1090 return eb;
1079} 1091}
1080 1092
1081 1093
1082int btrfs_write_tree_block(struct extent_buffer *buf) 1094int btrfs_write_tree_block(struct extent_buffer *buf)
1083{ 1095{
1084 return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, 1096 return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
1085 buf->start + buf->len - 1); 1097 buf->start + buf->len - 1);
1086} 1098}
1087 1099
1088int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) 1100int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
1089{ 1101{
1090 return filemap_fdatawait_range(buf->first_page->mapping, 1102 return filemap_fdatawait_range(buf->pages[0]->mapping,
1091 buf->start, buf->start + buf->len - 1); 1103 buf->start, buf->start + buf->len - 1);
1092} 1104}
1093 1105
@@ -1513,41 +1525,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1513 return 0; 1525 return 0;
1514} 1526}
1515 1527
1516static int bio_ready_for_csum(struct bio *bio)
1517{
1518 u64 length = 0;
1519 u64 buf_len = 0;
1520 u64 start = 0;
1521 struct page *page;
1522 struct extent_io_tree *io_tree = NULL;
1523 struct bio_vec *bvec;
1524 int i;
1525 int ret;
1526
1527 bio_for_each_segment(bvec, bio, i) {
1528 page = bvec->bv_page;
1529 if (page->private == EXTENT_PAGE_PRIVATE) {
1530 length += bvec->bv_len;
1531 continue;
1532 }
1533 if (!page->private) {
1534 length += bvec->bv_len;
1535 continue;
1536 }
1537 length = bvec->bv_len;
1538 buf_len = page->private >> 2;
1539 start = page_offset(page) + bvec->bv_offset;
1540 io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1541 }
1542 /* are we fully contained in this bio? */
1543 if (buf_len <= length)
1544 return 1;
1545
1546 ret = extent_range_uptodate(io_tree, start + length,
1547 start + buf_len - 1);
1548 return ret;
1549}
1550
1551/* 1528/*
1552 * called by the kthread helper functions to finally call the bio end_io 1529 * called by the kthread helper functions to finally call the bio end_io
1553 * functions. This is where read checksum verification actually happens 1530 * functions. This is where read checksum verification actually happens
@@ -1563,17 +1540,6 @@ static void end_workqueue_fn(struct btrfs_work *work)
1563 bio = end_io_wq->bio; 1540 bio = end_io_wq->bio;
1564 fs_info = end_io_wq->info; 1541 fs_info = end_io_wq->info;
1565 1542
1566 /* metadata bio reads are special because the whole tree block must
1567 * be checksummed at once. This makes sure the entire block is in
1568 * ram and up to date before trying to verify things. For
1569 * blocksize <= pagesize, it is basically a noop
1570 */
1571 if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
1572 !bio_ready_for_csum(bio)) {
1573 btrfs_queue_worker(&fs_info->endio_meta_workers,
1574 &end_io_wq->work);
1575 return;
1576 }
1577 error = end_io_wq->error; 1543 error = end_io_wq->error;
1578 bio->bi_private = end_io_wq->private; 1544 bio->bi_private = end_io_wq->private;
1579 bio->bi_end_io = end_io_wq->end_io; 1545 bio->bi_end_io = end_io_wq->end_io;
@@ -2135,10 +2101,38 @@ int open_ctree(struct super_block *sb,
2135 goto fail_alloc; 2101 goto fail_alloc;
2136 } 2102 }
2137 2103
2104 if (btrfs_super_leafsize(disk_super) !=
2105 btrfs_super_nodesize(disk_super)) {
2106 printk(KERN_ERR "BTRFS: couldn't mount because metadata "
2107 "blocksizes don't match. node %d leaf %d\n",
2108 btrfs_super_nodesize(disk_super),
2109 btrfs_super_leafsize(disk_super));
2110 err = -EINVAL;
2111 goto fail_alloc;
2112 }
2113 if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
2114 printk(KERN_ERR "BTRFS: couldn't mount because metadata "
2115 "blocksize (%d) was too large\n",
2116 btrfs_super_leafsize(disk_super));
2117 err = -EINVAL;
2118 goto fail_alloc;
2119 }
2120
2138 features = btrfs_super_incompat_flags(disk_super); 2121 features = btrfs_super_incompat_flags(disk_super);
2139 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; 2122 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
2140 if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) 2123 if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
2141 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 2124 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
2125
2126 /*
2127 * flag our filesystem as having big metadata blocks if
2128 * they are bigger than the page size
2129 */
2130 if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
2131 if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
2132 printk(KERN_INFO "btrfs flagging fs with big metadata feature\n");
2133 features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
2134 }
2135
2142 btrfs_set_super_incompat_flags(disk_super, features); 2136 btrfs_set_super_incompat_flags(disk_super, features);
2143 2137
2144 features = btrfs_super_compat_ro_flags(disk_super) & 2138 features = btrfs_super_compat_ro_flags(disk_super) &
@@ -3122,7 +3116,7 @@ int close_ctree(struct btrfs_root *root)
3122int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) 3116int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
3123{ 3117{
3124 int ret; 3118 int ret;
3125 struct inode *btree_inode = buf->first_page->mapping->host; 3119 struct inode *btree_inode = buf->pages[0]->mapping->host;
3126 3120
3127 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf, 3121 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
3128 NULL); 3122 NULL);
@@ -3136,14 +3130,14 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
3136 3130
3137int btrfs_set_buffer_uptodate(struct extent_buffer *buf) 3131int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
3138{ 3132{
3139 struct inode *btree_inode = buf->first_page->mapping->host; 3133 struct inode *btree_inode = buf->pages[0]->mapping->host;
3140 return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, 3134 return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
3141 buf); 3135 buf);
3142} 3136}
3143 3137
3144void btrfs_mark_buffer_dirty(struct extent_buffer *buf) 3138void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3145{ 3139{
3146 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 3140 struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
3147 u64 transid = btrfs_header_generation(buf); 3141 u64 transid = btrfs_header_generation(buf);
3148 struct inode *btree_inode = root->fs_info->btree_inode; 3142 struct inode *btree_inode = root->fs_info->btree_inode;
3149 int was_dirty; 3143 int was_dirty;
@@ -3212,7 +3206,7 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
3212 3206
3213int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) 3207int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
3214{ 3208{
3215 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; 3209 struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
3216 int ret; 3210 int ret;
3217 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); 3211 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
3218 if (ret == 0) 3212 if (ret == 0)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a55fbe6252de..c6c9ce463c86 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3548,26 +3548,7 @@ out:
3548inline struct page *extent_buffer_page(struct extent_buffer *eb, 3548inline struct page *extent_buffer_page(struct extent_buffer *eb,
3549 unsigned long i) 3549 unsigned long i)
3550{ 3550{
3551 struct page *p; 3551 return eb->pages[i];
3552 struct address_space *mapping;
3553
3554 if (i == 0)
3555 return eb->first_page;
3556 i += eb->start >> PAGE_CACHE_SHIFT;
3557 mapping = eb->first_page->mapping;
3558 if (!mapping)
3559 return NULL;
3560
3561 /*
3562 * extent_buffer_page is only called after pinning the page
3563 * by increasing the reference count. So we know the page must
3564 * be in the radix tree.
3565 */
3566 rcu_read_lock();
3567 p = radix_tree_lookup(&mapping->page_tree, i);
3568 rcu_read_unlock();
3569
3570 return p;
3571} 3552}
3572 3553
3573inline unsigned long num_extent_pages(u64 start, u64 len) 3554inline unsigned long num_extent_pages(u64 start, u64 len)
@@ -3576,6 +3557,19 @@ inline unsigned long num_extent_pages(u64 start, u64 len)
3576 (start >> PAGE_CACHE_SHIFT); 3557 (start >> PAGE_CACHE_SHIFT);
3577} 3558}
3578 3559
3560static void __free_extent_buffer(struct extent_buffer *eb)
3561{
3562#if LEAK_DEBUG
3563 unsigned long flags;
3564 spin_lock_irqsave(&leak_lock, flags);
3565 list_del(&eb->leak_list);
3566 spin_unlock_irqrestore(&leak_lock, flags);
3567#endif
3568 if (eb->pages && eb->pages != eb->inline_pages)
3569 kfree(eb->pages);
3570 kmem_cache_free(extent_buffer_cache, eb);
3571}
3572
3579static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, 3573static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3580 u64 start, 3574 u64 start,
3581 unsigned long len, 3575 unsigned long len,
@@ -3608,21 +3602,25 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3608 spin_unlock_irqrestore(&leak_lock, flags); 3602 spin_unlock_irqrestore(&leak_lock, flags);
3609#endif 3603#endif
3610 atomic_set(&eb->refs, 1); 3604 atomic_set(&eb->refs, 1);
3605 atomic_set(&eb->pages_reading, 0);
3606
3607 if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
3608 struct page **pages;
3609 int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
3610 PAGE_CACHE_SHIFT;
3611 pages = kzalloc(num_pages, mask);
3612 if (!pages) {
3613 __free_extent_buffer(eb);
3614 return NULL;
3615 }
3616 eb->pages = pages;
3617 } else {
3618 eb->pages = eb->inline_pages;
3619 }
3611 3620
3612 return eb; 3621 return eb;
3613} 3622}
3614 3623
3615static void __free_extent_buffer(struct extent_buffer *eb)
3616{
3617#if LEAK_DEBUG
3618 unsigned long flags;
3619 spin_lock_irqsave(&leak_lock, flags);
3620 list_del(&eb->leak_list);
3621 spin_unlock_irqrestore(&leak_lock, flags);
3622#endif
3623 kmem_cache_free(extent_buffer_cache, eb);
3624}
3625
3626/* 3624/*
3627 * Helper for releasing extent buffer page. 3625 * Helper for releasing extent buffer page.
3628 */ 3626 */
@@ -3632,9 +3630,6 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3632 unsigned long index; 3630 unsigned long index;
3633 struct page *page; 3631 struct page *page;
3634 3632
3635 if (!eb->first_page)
3636 return;
3637
3638 index = num_extent_pages(eb->start, eb->len); 3633 index = num_extent_pages(eb->start, eb->len);
3639 if (start_idx >= index) 3634 if (start_idx >= index)
3640 return; 3635 return;
@@ -3657,8 +3652,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3657} 3652}
3658 3653
3659struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 3654struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3660 u64 start, unsigned long len, 3655 u64 start, unsigned long len)
3661 struct page *page0)
3662{ 3656{
3663 unsigned long num_pages = num_extent_pages(start, len); 3657 unsigned long num_pages = num_extent_pages(start, len);
3664 unsigned long i; 3658 unsigned long i;
@@ -3674,7 +3668,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3674 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3668 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3675 if (eb && atomic_inc_not_zero(&eb->refs)) { 3669 if (eb && atomic_inc_not_zero(&eb->refs)) {
3676 rcu_read_unlock(); 3670 rcu_read_unlock();
3677 mark_page_accessed(eb->first_page); 3671 mark_page_accessed(eb->pages[0]);
3678 return eb; 3672 return eb;
3679 } 3673 }
3680 rcu_read_unlock(); 3674 rcu_read_unlock();
@@ -3683,32 +3677,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3683 if (!eb) 3677 if (!eb)
3684 return NULL; 3678 return NULL;
3685 3679
3686 if (page0) { 3680 for (i = 0; i < num_pages; i++, index++) {
3687 eb->first_page = page0;
3688 i = 1;
3689 index++;
3690 page_cache_get(page0);
3691 mark_page_accessed(page0);
3692 set_page_extent_mapped(page0);
3693 set_page_extent_head(page0, len);
3694 uptodate = PageUptodate(page0);
3695 } else {
3696 i = 0;
3697 }
3698 for (; i < num_pages; i++, index++) {
3699 p = find_or_create_page(mapping, index, GFP_NOFS); 3681 p = find_or_create_page(mapping, index, GFP_NOFS);
3700 if (!p) { 3682 if (!p) {
3701 WARN_ON(1); 3683 WARN_ON(1);
3702 goto free_eb; 3684 goto free_eb;
3703 } 3685 }
3704 set_page_extent_mapped(p);
3705 mark_page_accessed(p); 3686 mark_page_accessed(p);
3706 if (i == 0) { 3687 eb->pages[i] = p;
3707 eb->first_page = p;
3708 set_page_extent_head(p, len);
3709 } else {
3710 set_page_private(p, EXTENT_PAGE_PRIVATE);
3711 }
3712 if (!PageUptodate(p)) 3688 if (!PageUptodate(p))
3713 uptodate = 0; 3689 uptodate = 0;
3714 3690
@@ -3716,8 +3692,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3716 * see below about how we avoid a nasty race with release page 3692 * see below about how we avoid a nasty race with release page
3717 * and why we unlock later 3693 * and why we unlock later
3718 */ 3694 */
3719 if (i != 0)
3720 unlock_page(p);
3721 } 3695 }
3722 if (uptodate) 3696 if (uptodate)
3723 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3697 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -3751,15 +3725,23 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3751 * after the extent buffer is in the radix tree so 3725 * after the extent buffer is in the radix tree so
3752 * it doesn't get lost 3726 * it doesn't get lost
3753 */ 3727 */
3754 set_page_extent_mapped(eb->first_page); 3728 set_page_extent_mapped(eb->pages[0]);
3755 set_page_extent_head(eb->first_page, eb->len); 3729 set_page_extent_head(eb->pages[0], eb->len);
3756 if (!page0) 3730 SetPageChecked(eb->pages[0]);
3757 unlock_page(eb->first_page); 3731 for (i = 1; i < num_pages; i++) {
3732 p = extent_buffer_page(eb, i);
3733 set_page_extent_mapped(p);
3734 ClearPageChecked(p);
3735 unlock_page(p);
3736 }
3737 unlock_page(eb->pages[0]);
3758 return eb; 3738 return eb;
3759 3739
3760free_eb: 3740free_eb:
3761 if (eb->first_page && !page0) 3741 for (i = 0; i < num_pages; i++) {
3762 unlock_page(eb->first_page); 3742 if (eb->pages[i])
3743 unlock_page(eb->pages[i]);
3744 }
3763 3745
3764 if (!atomic_dec_and_test(&eb->refs)) 3746 if (!atomic_dec_and_test(&eb->refs))
3765 return exists; 3747 return exists;
@@ -3776,7 +3758,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3776 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3758 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3777 if (eb && atomic_inc_not_zero(&eb->refs)) { 3759 if (eb && atomic_inc_not_zero(&eb->refs)) {
3778 rcu_read_unlock(); 3760 rcu_read_unlock();
3779 mark_page_accessed(eb->first_page); 3761 mark_page_accessed(eb->pages[0]);
3780 return eb; 3762 return eb;
3781 } 3763 }
3782 rcu_read_unlock(); 3764 rcu_read_unlock();
@@ -3981,8 +3963,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3981 int ret = 0; 3963 int ret = 0;
3982 int locked_pages = 0; 3964 int locked_pages = 0;
3983 int all_uptodate = 1; 3965 int all_uptodate = 1;
3984 int inc_all_pages = 0;
3985 unsigned long num_pages; 3966 unsigned long num_pages;
3967 unsigned long num_reads = 0;
3986 struct bio *bio = NULL; 3968 struct bio *bio = NULL;
3987 unsigned long bio_flags = 0; 3969 unsigned long bio_flags = 0;
3988 3970
@@ -4014,8 +3996,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
4014 lock_page(page); 3996 lock_page(page);
4015 } 3997 }
4016 locked_pages++; 3998 locked_pages++;
4017 if (!PageUptodate(page)) 3999 if (!PageUptodate(page)) {
4000 num_reads++;
4018 all_uptodate = 0; 4001 all_uptodate = 0;
4002 }
4019 } 4003 }
4020 if (all_uptodate) { 4004 if (all_uptodate) {
4021 if (start_i == 0) 4005 if (start_i == 0)
@@ -4023,20 +4007,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
4023 goto unlock_exit; 4007 goto unlock_exit;
4024 } 4008 }
4025 4009
4010 atomic_set(&eb->pages_reading, num_reads);
4026 for (i = start_i; i < num_pages; i++) { 4011 for (i = start_i; i < num_pages; i++) {
4027 page = extent_buffer_page(eb, i); 4012 page = extent_buffer_page(eb, i);
4028
4029 WARN_ON(!PagePrivate(page));
4030
4031 set_page_extent_mapped(page); 4013 set_page_extent_mapped(page);
4032 if (i == 0) 4014 if (i == 0)
4033 set_page_extent_head(page, eb->len); 4015 set_page_extent_head(page, eb->len);
4034
4035 if (inc_all_pages)
4036 page_cache_get(page);
4037 if (!PageUptodate(page)) { 4016 if (!PageUptodate(page)) {
4038 if (start_i == 0)
4039 inc_all_pages = 1;
4040 ClearPageError(page); 4017 ClearPageError(page);
4041 err = __extent_read_full_page(tree, page, 4018 err = __extent_read_full_page(tree, page,
4042 get_extent, &bio, 4019 get_extent, &bio,
@@ -4304,15 +4281,20 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
4304{ 4281{
4305 char *dst_kaddr = page_address(dst_page); 4282 char *dst_kaddr = page_address(dst_page);
4306 char *src_kaddr; 4283 char *src_kaddr;
4284 int must_memmove = 0;
4307 4285
4308 if (dst_page != src_page) { 4286 if (dst_page != src_page) {
4309 src_kaddr = page_address(src_page); 4287 src_kaddr = page_address(src_page);
4310 } else { 4288 } else {
4311 src_kaddr = dst_kaddr; 4289 src_kaddr = dst_kaddr;
4312 BUG_ON(areas_overlap(src_off, dst_off, len)); 4290 if (areas_overlap(src_off, dst_off, len))
4291 must_memmove = 1;
4313 } 4292 }
4314 4293
4315 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 4294 if (must_memmove)
4295 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
4296 else
4297 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
4316} 4298}
4317 4299
4318void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 4300void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
@@ -4382,7 +4364,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4382 "len %lu len %lu\n", dst_offset, len, dst->len); 4364 "len %lu len %lu\n", dst_offset, len, dst->len);
4383 BUG_ON(1); 4365 BUG_ON(1);
4384 } 4366 }
4385 if (!areas_overlap(src_offset, dst_offset, len)) { 4367 if (dst_offset < src_offset) {
4386 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 4368 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
4387 return; 4369 return;
4388 } 4370 }
@@ -4429,7 +4411,8 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
4429 return ret; 4411 return ret;
4430 } 4412 }
4431 4413
4432 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 4414 if (atomic_read(&eb->refs) > 1 ||
4415 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
4433 ret = 0; 4416 ret = 0;
4434 goto out; 4417 goto out;
4435 } 4418 }
@@ -4442,7 +4425,6 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
4442 ret = 0; 4425 ret = 0;
4443 goto out; 4426 goto out;
4444 } 4427 }
4445
4446 radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); 4428 radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4447out: 4429out:
4448 spin_unlock(&tree->buffer_lock); 4430 spin_unlock(&tree->buffer_lock);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index cecc3518c121..4e38a3d9631a 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -119,16 +119,18 @@ struct extent_state {
119 struct list_head leak_list; 119 struct list_head leak_list;
120}; 120};
121 121
122#define INLINE_EXTENT_BUFFER_PAGES 16
123#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE)
122struct extent_buffer { 124struct extent_buffer {
123 u64 start; 125 u64 start;
124 unsigned long len; 126 unsigned long len;
125 unsigned long map_start; 127 unsigned long map_start;
126 unsigned long map_len; 128 unsigned long map_len;
127 struct page *first_page;
128 unsigned long bflags; 129 unsigned long bflags;
130 atomic_t refs;
131 atomic_t pages_reading;
129 struct list_head leak_list; 132 struct list_head leak_list;
130 struct rcu_head rcu_head; 133 struct rcu_head rcu_head;
131 atomic_t refs;
132 pid_t lock_owner; 134 pid_t lock_owner;
133 135
134 /* count of read lock holders on the extent buffer */ 136 /* count of read lock holders on the extent buffer */
@@ -152,6 +154,9 @@ struct extent_buffer {
152 * to unlock 154 * to unlock
153 */ 155 */
154 wait_queue_head_t read_lock_wq; 156 wait_queue_head_t read_lock_wq;
157 wait_queue_head_t lock_wq;
158 struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES];
159 struct page **pages;
155}; 160};
156 161
157static inline void extent_set_compress_type(unsigned long *bio_flags, 162static inline void extent_set_compress_type(unsigned long *bio_flags,
@@ -251,8 +256,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
251void set_page_extent_mapped(struct page *page); 256void set_page_extent_mapped(struct page *page);
252 257
253struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 258struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
254 u64 start, unsigned long len, 259 u64 start, unsigned long len);
255 struct page *page0);
256struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 260struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
257 u64 start, unsigned long len); 261 u64 start, unsigned long len);
258void free_extent_buffer(struct extent_buffer *eb); 262void free_extent_buffer(struct extent_buffer *eb);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index baa74f3db691..6ea71c60e80a 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -19,6 +19,7 @@
19#include "ctree.h" 19#include "ctree.h"
20#include "disk-io.h" 20#include "disk-io.h"
21#include "transaction.h" 21#include "transaction.h"
22#include "print-tree.h"
22 23
23static int find_name_in_backref(struct btrfs_path *path, const char *name, 24static int find_name_in_backref(struct btrfs_path *path, const char *name,
24 int name_len, struct btrfs_inode_ref **ref_ret) 25 int name_len, struct btrfs_inode_ref **ref_ret)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ef41f285a475..58aad63e1ad3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4384,7 +4384,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
4384 * to silence the warning eg. on PowerPC 64. 4384 * to silence the warning eg. on PowerPC 64.
4385 */ 4385 */
4386 if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) 4386 if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
4387 SetPageUptodate(sb->first_page); 4387 SetPageUptodate(sb->pages[0]);
4388 4388
4389 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 4389 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
4390 array_size = btrfs_super_sys_array_size(super_copy); 4390 array_size = btrfs_super_sys_array_size(super_copy);