path: root/fs/btrfs/extent_io.c
author	Linus Torvalds <torvalds@linux-foundation.org>	2012-03-30 15:44:29 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-03-30 15:44:29 -0400
commit	9613bebb223dea3179c265dc31e1bb41ae39f321 (patch)
tree	39bf883573d23775a53be3172323c0237fef5630 /fs/btrfs/extent_io.c
parent	40380f1c7841a5dcbf0b20f0b6da11969211ef77 (diff)
parent	bc3f116fec194f1d7329b160c266fe16b9266a1e (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs fixes and features from Chris Mason:
 "We've merged in the error handling patches from SuSE. These are
  already shipping in the sles kernel, and they give btrfs the ability
  to abort transactions and go readonly on errors. It involves a lot of
  churn as they clarify BUG_ONs, and remove the ones we now properly
  deal with.

  Josef reworked the way our metadata interacts with the page cache.
  page->private now points to the btrfs extent_buffer object, which
  makes everything faster. He changed it so we write a whole extent
  buffer at a time instead of allowing individual pages to go down,
  which will be important for the raid5/6 code (for the 3.5 merge
  window ;)

  Josef also made us more aggressive about dropping pages for metadata
  blocks that were freed due to COW. Overall, our metadata caching is
  much faster now.

  We've integrated my patch for metadata bigger than the page size.
  This allows metadata blocks up to 64KB in size. In practice 16K and
  32K seem to work best. For workloads with lots of metadata, this cuts
  down the size of the extent allocation tree dramatically and
  fragments much less.

  Scrub was updated to support the larger block sizes, which ended up
  being a fairly large change (thanks Stefan Behrens).

  We also have an assortment of fixes and updates, especially to the
  balancing code (Ilya Dryomov), the back ref walker (Jan Schmidt) and
  the defragging code (Liu Bo)."

Fixed up trivial conflicts in fs/btrfs/scrub.c that were just due to the
removal of the second argument to k[un]map_atomic() in commit
7ac687d9e047.

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (75 commits)
  Btrfs: update the checks for mixed block groups with big metadata blocks
  Btrfs: update to the right index of defragment
  Btrfs: do not bother to defrag an extent if it is a big real extent
  Btrfs: add a check to decide if we should defrag the range
  Btrfs: fix recursive defragment with autodefrag option
  Btrfs: fix the mismatch of page->mapping
  Btrfs: fix race between direct io and autodefrag
  Btrfs: fix deadlock during allocating chunks
  Btrfs: show useful info in space reservation tracepoint
  Btrfs: don't use crc items bigger than 4KB
  Btrfs: flush out and clean up any block device pages during mount
  btrfs: disallow unequal data/metadata blocksize for mixed block groups
  Btrfs: enhance superblock sanity checks
  Btrfs: change scrub to support big blocks
  Btrfs: minor cleanup in scrub
  Btrfs: introduce common define for max number of mirrors
  Btrfs: fix infinite loop in btrfs_shrink_device()
  Btrfs: fix memory leak in resolver code
  Btrfs: allow dup for data chunks in mixed mode
  Btrfs: validate target profiles only if we are going to use them
  ...
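The error-handling churn described above is largely mechanical, and the pattern repeats throughout the diff below: call sites that used to BUG_ON() a failure from split_state() or insert_state() now report it through the new extent_io_tree_panic() helper (or propagate it, so the transaction can be aborted and the filesystem forced read-only). A simplified before/after sketch of that pattern, condensed from the hunks below rather than a complete function:

	/* before: failure assumed impossible, silently trusted */
	err = split_state(tree, state, prealloc, start);
	BUG_ON(err == -EEXIST);

	/* after: any failure here means the locked extent tree was
	 * modified underneath us; report it through one helper */
	err = split_state(tree, state, prealloc, start);
	if (err)
		extent_io_tree_panic(tree, err);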
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--	fs/btrfs/extent_io.c	1035
1 files changed, 734 insertions(+), 301 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2862454bcdb3..8d904dd7ea9f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -19,6 +19,7 @@
 #include "btrfs_inode.h"
 #include "volumes.h"
 #include "check-integrity.h"
+#include "locking.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -53,6 +54,13 @@ struct extent_page_data {
 	unsigned int sync_io:1;
 };
 
+static noinline void flush_write_bio(void *data);
+static inline struct btrfs_fs_info *
+tree_fs_info(struct extent_io_tree *tree)
+{
+	return btrfs_sb(tree->mapping->host->i_sb);
+}
+
 int __init extent_io_init(void)
 {
 	extent_state_cache = kmem_cache_create("extent_state",
@@ -136,6 +144,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
 #endif
 	atomic_set(&state->refs, 1);
 	init_waitqueue_head(&state->wq);
+	trace_alloc_extent_state(state, mask, _RET_IP_);
 	return state;
 }
 
@@ -153,6 +162,7 @@ void free_extent_state(struct extent_state *state)
 		list_del(&state->leak_list);
 		spin_unlock_irqrestore(&leak_lock, flags);
 #endif
+		trace_free_extent_state(state, _RET_IP_);
 		kmem_cache_free(extent_state_cache, state);
 	}
 }
@@ -439,6 +449,13 @@ alloc_extent_state_atomic(struct extent_state *prealloc)
 	return prealloc;
 }
 
+void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+	btrfs_panic(tree_fs_info(tree), err, "Locking error: "
+		    "Extent tree was modified by another "
+		    "thread while locked.");
+}
+
 /*
  * clear some bits on a range in the tree. This may require splitting
  * or inserting elements in the tree, so the gfp mask is used to
@@ -449,8 +466,7 @@ alloc_extent_state_atomic(struct extent_state *prealloc)
  *
  * the range [start, end] is inclusive.
  *
- * This takes the tree lock, and returns < 0 on error, > 0 if any of the
- * bits were already set, or zero if none of the bits were already set.
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
  */
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		     int bits, int wake, int delete,
@@ -464,7 +480,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	struct rb_node *node;
 	u64 last_end;
 	int err;
-	int set = 0;
 	int clear = 0;
 
 	if (delete)
@@ -542,12 +557,14 @@ hit_next:
 		prealloc = alloc_extent_state_atomic(prealloc);
 		BUG_ON(!prealloc);
 		err = split_state(tree, state, prealloc, start);
-		BUG_ON(err == -EEXIST);
+		if (err)
+			extent_io_tree_panic(tree, err);
+
 		prealloc = NULL;
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			set |= clear_state_bit(tree, state, &bits, wake);
+			clear_state_bit(tree, state, &bits, wake);
 			if (last_end == (u64)-1)
 				goto out;
 			start = last_end + 1;
@@ -564,17 +581,19 @@ hit_next:
 		prealloc = alloc_extent_state_atomic(prealloc);
 		BUG_ON(!prealloc);
 		err = split_state(tree, state, prealloc, end + 1);
-		BUG_ON(err == -EEXIST);
+		if (err)
+			extent_io_tree_panic(tree, err);
+
 		if (wake)
 			wake_up(&state->wq);
 
-		set |= clear_state_bit(tree, prealloc, &bits, wake);
+		clear_state_bit(tree, prealloc, &bits, wake);
 
 		prealloc = NULL;
 		goto out;
 	}
 
-	set |= clear_state_bit(tree, state, &bits, wake);
+	clear_state_bit(tree, state, &bits, wake);
 next:
 	if (last_end == (u64)-1)
 		goto out;
@@ -591,7 +610,7 @@ out:
 	if (prealloc)
 		free_extent_state(prealloc);
 
-	return set;
+	return 0;
 
 search_again:
 	if (start > end)
@@ -602,8 +621,8 @@ search_again:
 		goto again;
 }
 
-static int wait_on_state(struct extent_io_tree *tree,
+static void wait_on_state(struct extent_io_tree *tree,
 			 struct extent_state *state)
 		__releases(tree->lock)
 		__acquires(tree->lock)
 {
@@ -613,7 +632,6 @@ static int wait_on_state(struct extent_io_tree *tree,
 	schedule();
 	spin_lock(&tree->lock);
 	finish_wait(&state->wq, &wait);
-	return 0;
 }
 
 /*
@@ -621,7 +639,7 @@ static int wait_on_state(struct extent_io_tree *tree,
  * The range [start, end] is inclusive.
  * The tree lock is taken by this function
  */
-int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
 {
 	struct extent_state *state;
 	struct rb_node *node;
@@ -658,7 +676,6 @@ again:
 	}
 out:
 	spin_unlock(&tree->lock);
-	return 0;
 }
 
 static void set_state_bits(struct extent_io_tree *tree,
@@ -706,9 +723,10 @@ static void uncache_state(struct extent_state **cached_ptr)
  * [start, end] is inclusive This takes the tree lock.
  */
 
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		   int bits, int exclusive_bits, u64 *failed_start,
-		   struct extent_state **cached_state, gfp_t mask)
+static int __must_check
+__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		 int bits, int exclusive_bits, u64 *failed_start,
+		 struct extent_state **cached_state, gfp_t mask)
 {
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
@@ -742,8 +760,10 @@ again:
 		prealloc = alloc_extent_state_atomic(prealloc);
 		BUG_ON(!prealloc);
 		err = insert_state(tree, prealloc, start, end, &bits);
+		if (err)
+			extent_io_tree_panic(tree, err);
+
 		prealloc = NULL;
-		BUG_ON(err == -EEXIST);
 		goto out;
 	}
 	state = rb_entry(node, struct extent_state, rb_node);
@@ -809,7 +829,9 @@ hit_next:
 		prealloc = alloc_extent_state_atomic(prealloc);
 		BUG_ON(!prealloc);
 		err = split_state(tree, state, prealloc, start);
-		BUG_ON(err == -EEXIST);
+		if (err)
+			extent_io_tree_panic(tree, err);
+
 		prealloc = NULL;
 		if (err)
 			goto out;
@@ -846,12 +868,9 @@ hit_next:
 		 */
 		err = insert_state(tree, prealloc, start, this_end,
 				   &bits);
-		BUG_ON(err == -EEXIST);
-		if (err) {
-			free_extent_state(prealloc);
-			prealloc = NULL;
-			goto out;
-		}
+		if (err)
+			extent_io_tree_panic(tree, err);
+
 		cache_state(prealloc, cached_state);
 		prealloc = NULL;
 		start = this_end + 1;
@@ -873,7 +892,8 @@ hit_next:
 		prealloc = alloc_extent_state_atomic(prealloc);
 		BUG_ON(!prealloc);
 		err = split_state(tree, state, prealloc, end + 1);
-		BUG_ON(err == -EEXIST);
+		if (err)
+			extent_io_tree_panic(tree, err);
 
 		set_state_bits(tree, prealloc, &bits);
 		cache_state(prealloc, cached_state);
@@ -900,6 +920,15 @@ search_again:
 		goto again;
 }
 
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
+		   u64 *failed_start, struct extent_state **cached_state,
+		   gfp_t mask)
+{
+	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
+				cached_state, mask);
+}
+
+
 /**
  * convert_extent - convert all bits in a given range from one bit to another
  * @tree:	the io tree to search
@@ -946,7 +975,8 @@ again:
 		}
 		err = insert_state(tree, prealloc, start, end, &bits);
 		prealloc = NULL;
-		BUG_ON(err == -EEXIST);
+		if (err)
+			extent_io_tree_panic(tree, err);
 		goto out;
 	}
 	state = rb_entry(node, struct extent_state, rb_node);
@@ -1002,7 +1032,8 @@ hit_next:
 			goto out;
 		}
 		err = split_state(tree, state, prealloc, start);
-		BUG_ON(err == -EEXIST);
+		if (err)
+			extent_io_tree_panic(tree, err);
 		prealloc = NULL;
 		if (err)
 			goto out;
@@ -1041,12 +1072,8 @@ hit_next:
 		 */
 		err = insert_state(tree, prealloc, start, this_end,
 				   &bits);
-		BUG_ON(err == -EEXIST);
-		if (err) {
-			free_extent_state(prealloc);
-			prealloc = NULL;
-			goto out;
-		}
+		if (err)
+			extent_io_tree_panic(tree, err);
 		prealloc = NULL;
 		start = this_end + 1;
 		goto search_again;
@@ -1065,7 +1092,8 @@ hit_next:
 		}
 
 		err = split_state(tree, state, prealloc, end + 1);
-		BUG_ON(err == -EEXIST);
+		if (err)
+			extent_io_tree_panic(tree, err);
 
 		set_state_bits(tree, prealloc, &bits);
 		clear_state_bit(tree, prealloc, &clear_bits, 0);
@@ -1095,14 +1123,14 @@ search_again:
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
 {
-	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
+	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
 			      NULL, mask);
 }
 
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask)
 {
-	return set_extent_bit(tree, start, end, bits, 0, NULL,
+	return set_extent_bit(tree, start, end, bits, NULL,
 			      NULL, mask);
 }
 
@@ -1117,7 +1145,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 {
 	return set_extent_bit(tree, start, end,
 			      EXTENT_DELALLOC | EXTENT_UPTODATE,
-			      0, NULL, cached_state, mask);
+			      NULL, cached_state, mask);
 }
 
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1131,7 +1159,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		   gfp_t mask)
 {
-	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
+	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
 			      NULL, mask);
 }
 
@@ -1139,7 +1167,7 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask)
 {
 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
-			      NULL, cached_state, mask);
+			      cached_state, mask);
 }
 
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -1155,42 +1183,40 @@ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
  * us if waiting is desired.
  */
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		     int bits, struct extent_state **cached_state, gfp_t mask)
+		     int bits, struct extent_state **cached_state)
 {
 	int err;
 	u64 failed_start;
 	while (1) {
-		err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
-				     EXTENT_LOCKED, &failed_start,
-				     cached_state, mask);
-		if (err == -EEXIST && (mask & __GFP_WAIT)) {
+		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+				       EXTENT_LOCKED, &failed_start,
+				       cached_state, GFP_NOFS);
+		if (err == -EEXIST) {
 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
 			start = failed_start;
-		} else {
+		} else
 			break;
-		}
 		WARN_ON(start > end);
 	}
 	return err;
 }
 
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 {
-	return lock_extent_bits(tree, start, end, 0, NULL, mask);
+	return lock_extent_bits(tree, start, end, 0, NULL);
 }
 
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
-		    gfp_t mask)
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	int err;
 	u64 failed_start;
 
-	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
-			     &failed_start, NULL, mask);
+	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+			       &failed_start, NULL, GFP_NOFS);
 	if (err == -EEXIST) {
 		if (failed_start > start)
 			clear_extent_bit(tree, start, failed_start - 1,
-					 EXTENT_LOCKED, 1, 0, NULL, mask);
+					 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
 		return 0;
 	}
 	return 1;
@@ -1203,10 +1229,10 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
 			   mask);
 }
 
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
-				mask);
+				GFP_NOFS);
 }
 
 /*
@@ -1220,7 +1246,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 
 	while (index <= end_index) {
 		page = find_get_page(tree->mapping, index);
-		BUG_ON(!page);
+		BUG_ON(!page); /* Pages should be in the extent_io_tree */
 		set_page_writeback(page);
 		page_cache_release(page);
 		index++;
@@ -1343,9 +1369,9 @@ out:
 	return found;
 }
 
-static noinline int __unlock_for_delalloc(struct inode *inode,
+static noinline void __unlock_for_delalloc(struct inode *inode,
 					  struct page *locked_page,
 					  u64 start, u64 end)
 {
 	int ret;
 	struct page *pages[16];
@@ -1355,7 +1381,7 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
 	int i;
 
 	if (index == locked_page->index && end_index == index)
-		return 0;
+		return;
 
 	while (nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
@@ -1370,7 +1396,6 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
 		index += ret;
 		cond_resched();
 	}
-	return 0;
 }
 
 static noinline int lock_delalloc_pages(struct inode *inode,
@@ -1500,11 +1525,10 @@ again:
 			goto out_failed;
 		}
 	}
-	BUG_ON(ret);
+	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
 
 	/* step three, lock the state bits for the whole range */
-	lock_extent_bits(tree, delalloc_start, delalloc_end,
-			 0, &cached_state, GFP_NOFS);
+	lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
 
 	/* then test to make sure it is all still delalloc */
 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
@@ -1761,39 +1785,34 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
  * helper function to set a given page up to date if all the
  * extents in the tree for that page are up to date
  */
-static int check_page_uptodate(struct extent_io_tree *tree,
-			       struct page *page)
+static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
 {
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
 		SetPageUptodate(page);
-	return 0;
 }
 
 /*
  * helper function to unlock a page if all the extents in the tree
  * for that page are unlocked
  */
-static int check_page_locked(struct extent_io_tree *tree,
-			     struct page *page)
+static void check_page_locked(struct extent_io_tree *tree, struct page *page)
 {
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
 		unlock_page(page);
-	return 0;
 }
 
 /*
  * helper function to end page writeback if all the extents
  * in the tree for that page are done with writeback
  */
-static int check_page_writeback(struct extent_io_tree *tree,
+static void check_page_writeback(struct extent_io_tree *tree,
 				struct page *page)
 {
 	end_page_writeback(page);
-	return 0;
 }
 
 /*
@@ -1912,6 +1931,26 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
 	return 0;
 }
 
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+			 int mirror_num)
+{
+	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+	u64 start = eb->start;
+	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
+	int ret;
+
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = extent_buffer_page(eb, i);
+		ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
+					start, p, mirror_num);
+		if (ret)
+			break;
+		start += PAGE_CACHE_SIZE;
+	}
+
+	return ret;
+}
+
 /*
  * each time an IO finishes, we do a fast check in the IO failure tree
  * to see if we need to process or clean up an io_failure_record
@@ -2258,6 +2297,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 	u64 start;
 	u64 end;
 	int whole_page;
+	int failed_mirror;
 	int ret;
 
 	if (err)
@@ -2304,9 +2344,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			else
 				clean_io_failure(start, page);
 		}
-		if (!uptodate) {
-			int failed_mirror;
+
+		if (!uptodate)
 			failed_mirror = (int)(unsigned long)bio->bi_bdev;
+
+		if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
+			ret = tree->ops->readpage_io_failed_hook(page, failed_mirror);
+			if (!ret && !err &&
+			    test_bit(BIO_UPTODATE, &bio->bi_flags))
+				uptodate = 1;
+		} else if (!uptodate) {
 			/*
 			 * The generic bio_readpage_error handles errors the
 			 * following way: If possible, new read requests are
@@ -2320,7 +2367,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			ret = bio_readpage_error(bio, page, start, end,
 						 failed_mirror, NULL);
 			if (ret == 0) {
-error_handled:
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
 				if (err)
@@ -2328,16 +2374,9 @@ error_handled:
 				uncache_state(&cached);
 				continue;
 			}
-			if (tree->ops && tree->ops->readpage_io_failed_hook) {
-				ret = tree->ops->readpage_io_failed_hook(
-						bio, page, start, end,
-						failed_mirror, state);
-				if (ret == 0)
-					goto error_handled;
-			}
 		}
 
-		if (uptodate) {
+		if (uptodate && tree->track_uptodate) {
 			set_extent_uptodate(tree, start, end, &cached,
 					    GFP_ATOMIC);
 		}
@@ -2386,8 +2425,12 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 	return bio;
 }
 
-static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
-			  unsigned long bio_flags)
+/*
+ * Since writes are async, they will only return -ENOMEM.
+ * Reads can return the full range of I/O error conditions.
+ */
+static int __must_check submit_one_bio(int rw, struct bio *bio,
+				       int mirror_num, unsigned long bio_flags)
 {
 	int ret = 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -2413,6 +2456,19 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 	return ret;
 }
 
+static int merge_bio(struct extent_io_tree *tree, struct page *page,
+		     unsigned long offset, size_t size, struct bio *bio,
+		     unsigned long bio_flags)
+{
+	int ret = 0;
+	if (tree->ops && tree->ops->merge_bio_hook)
+		ret = tree->ops->merge_bio_hook(page, offset, size, bio,
+						bio_flags);
+	BUG_ON(ret < 0);
+	return ret;
+
+}
+
 static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			      struct page *page, sector_t sector,
 			      size_t size, unsigned long offset,
@@ -2441,12 +2497,12 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			sector;
 
 		if (prev_bio_flags != bio_flags || !contig ||
-		    (tree->ops && tree->ops->merge_bio_hook &&
-		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
-					       bio_flags)) ||
+		    merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
 		    bio_add_page(bio, page, page_size, offset) < page_size) {
 			ret = submit_one_bio(rw, bio, mirror_num,
 					     prev_bio_flags);
+			if (ret < 0)
+				return ret;
 			bio = NULL;
 		} else {
 			return 0;
@@ -2473,25 +2529,31 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 	return ret;
 }
 
-void set_page_extent_mapped(struct page *page)
+void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
 {
 	if (!PagePrivate(page)) {
 		SetPagePrivate(page);
 		page_cache_get(page);
-		set_page_private(page, EXTENT_PAGE_PRIVATE);
+		set_page_private(page, (unsigned long)eb);
+	} else {
+		WARN_ON(page->private != (unsigned long)eb);
 	}
 }
 
-static void set_page_extent_head(struct page *page, unsigned long len)
+void set_page_extent_mapped(struct page *page)
 {
-	WARN_ON(!PagePrivate(page));
-	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		page_cache_get(page);
+		set_page_private(page, EXTENT_PAGE_PRIVATE);
+	}
 }
 
 /*
  * basic readpage implementation. Locked extent state structs are inserted
  * into the tree that are removed when the IO is done (by the end_io
  * handlers)
+ * XXX JDM: This needs looking at to ensure proper page locking
  */
 static int __extent_read_full_page(struct extent_io_tree *tree,
 				   struct page *page,
@@ -2531,11 +2593,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 
 	end = page_end;
 	while (1) {
-		lock_extent(tree, start, end, GFP_NOFS);
+		lock_extent(tree, start, end);
 		ordered = btrfs_lookup_ordered_extent(inode, start);
 		if (!ordered)
 			break;
-		unlock_extent(tree, start, end, GFP_NOFS);
+		unlock_extent(tree, start, end);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		btrfs_put_ordered_extent(ordered);
 	}
@@ -2572,7 +2634,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 					 end - cur + 1, 0);
 		if (IS_ERR_OR_NULL(em)) {
 			SetPageError(page);
-			unlock_extent(tree, cur, end, GFP_NOFS);
+			unlock_extent(tree, cur, end);
 			break;
 		}
 		extent_offset = cur - em->start;
@@ -2624,7 +2686,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 		if (test_range_bit(tree, cur, cur_end,
 				   EXTENT_UPTODATE, 1, NULL)) {
 			check_page_uptodate(tree, page);
-			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			unlock_extent(tree, cur, cur + iosize - 1);
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
@@ -2634,7 +2696,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 		 */
 		if (block_start == EXTENT_MAP_INLINE) {
 			SetPageError(page);
-			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			unlock_extent(tree, cur, cur + iosize - 1);
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
@@ -2654,6 +2716,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 					 end_bio_extent_readpage, mirror_num,
 					 *bio_flags,
 					 this_bio_flag);
+			BUG_ON(ret == -ENOMEM);
 			nr++;
 			*bio_flags = this_bio_flag;
 		}
@@ -2795,7 +2858,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 						       delalloc_end,
 						       &page_started,
 						       &nr_written);
-			BUG_ON(ret);
+			/* File system has been set read-only */
+			if (ret) {
+				SetPageError(page);
+				goto done;
+			}
 			/*
 			 * delalloc_end is already one less than the total
 			 * length, so we don't subtract one from
@@ -2968,6 +3035,275 @@ done_unlocked:
 	return 0;
 }
 
+static int eb_wait(void *word)
+{
+	io_schedule();
+	return 0;
+}
+
+static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+{
+	wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
+		    TASK_UNINTERRUPTIBLE);
+}
+
+static int lock_extent_buffer_for_io(struct extent_buffer *eb,
+				     struct btrfs_fs_info *fs_info,
+				     struct extent_page_data *epd)
+{
+	unsigned long i, num_pages;
+	int flush = 0;
+	int ret = 0;
+
+	if (!btrfs_try_tree_write_lock(eb)) {
+		flush = 1;
+		flush_write_bio(epd);
+		btrfs_tree_lock(eb);
+	}
+
+	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
+		btrfs_tree_unlock(eb);
+		if (!epd->sync_io)
+			return 0;
+		if (!flush) {
+			flush_write_bio(epd);
+			flush = 1;
+		}
+		while (1) {
+			wait_on_extent_buffer_writeback(eb);
+			btrfs_tree_lock(eb);
+			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
+				break;
+			btrfs_tree_unlock(eb);
+		}
+	}
+
+	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+		spin_lock(&fs_info->delalloc_lock);
+		if (fs_info->dirty_metadata_bytes >= eb->len)
+			fs_info->dirty_metadata_bytes -= eb->len;
+		else
+			WARN_ON(1);
+		spin_unlock(&fs_info->delalloc_lock);
+		ret = 1;
+	}
+
+	btrfs_tree_unlock(eb);
+
+	if (!ret)
+		return ret;
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = extent_buffer_page(eb, i);
+
+		if (!trylock_page(p)) {
+			if (!flush) {
+				flush_write_bio(epd);
+				flush = 1;
+			}
+			lock_page(p);
+		}
+	}
+
+	return ret;
+}
+
+static void end_extent_buffer_writeback(struct extent_buffer *eb)
+{
+	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+}
+
+static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+{
+	int uptodate = err == 0;
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_buffer *eb;
+	int done;
+
+	do {
+		struct page *page = bvec->bv_page;
+
+		bvec--;
+		eb = (struct extent_buffer *)page->private;
+		BUG_ON(!eb);
+		done = atomic_dec_and_test(&eb->io_pages);
+
+		if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
+			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		end_page_writeback(page);
+
+		if (!done)
+			continue;
+
+		end_extent_buffer_writeback(eb);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+
+}
+
+static int write_one_eb(struct extent_buffer *eb,
+			struct btrfs_fs_info *fs_info,
+			struct writeback_control *wbc,
+			struct extent_page_data *epd)
+{
+	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
+	u64 offset = eb->start;
+	unsigned long i, num_pages;
+	int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
+	int ret;
+
+	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+	num_pages = num_extent_pages(eb->start, eb->len);
+	atomic_set(&eb->io_pages, num_pages);
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = extent_buffer_page(eb, i);
+
+		clear_page_dirty_for_io(p);
+		set_page_writeback(p);
+		ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
+					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
+					 -1, end_bio_extent_buffer_writepage,
+					 0, 0, 0);
+		if (ret) {
+			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+			SetPageError(p);
+			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
+				end_extent_buffer_writeback(eb);
+			ret = -EIO;
+			break;
+		}
+		offset += PAGE_CACHE_SIZE;
+		update_nr_written(p, wbc, 1);
+		unlock_page(p);
+	}
+
+	if (unlikely(ret)) {
+		for (; i < num_pages; i++) {
+			struct page *p = extent_buffer_page(eb, i);
+			unlock_page(p);
+		}
+	}
+
+	return ret;
+}
+
+int btree_write_cache_pages(struct address_space *mapping,
+			    struct writeback_control *wbc)
+{
+	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
+	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
+	struct extent_buffer *eb, *prev_eb = NULL;
+	struct extent_page_data epd = {
+		.bio = NULL,
+		.tree = tree,
+		.extent_locked = 0,
+		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+	};
+	int ret = 0;
+	int done = 0;
+	int nr_to_write_done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	int scanned = 0;
+	int tag;
+
+	pagevec_init(&pvec, 0);
+	if (wbc->range_cyclic) {
+		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		scanned = 1;
+	}
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
+retry:
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag_pages_for_writeback(mapping, index, end);
+	while (!done && !nr_to_write_done && (index <= end) &&
+	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+		unsigned i;
+
+		scanned = 1;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			if (!PagePrivate(page))
+				continue;
+
+			if (!wbc->range_cyclic && page->index > end) {
+				done = 1;
+				break;
+			}
+
+			eb = (struct extent_buffer *)page->private;
+			if (!eb) {
+				WARN_ON(1);
+				continue;
+			}
+
+			if (eb == prev_eb)
+				continue;
+
+			if (!atomic_inc_not_zero(&eb->refs)) {
+				WARN_ON(1);
+				continue;
+			}
+
+			prev_eb = eb;
+			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
+			if (!ret) {
+				free_extent_buffer(eb);
+				continue;
+			}
+
+			ret = write_one_eb(eb, fs_info, wbc, &epd);
+			if (ret) {
+				done = 1;
+				free_extent_buffer(eb);
+				break;
+			}
+			free_extent_buffer(eb);
+
+			/*
+			 * the filesystem may choose to bump up nr_to_write.
+			 * We have to make sure to honor the new nr_to_write
+			 * at any time
+			 */
+			nr_to_write_done = wbc->nr_to_write <= 0;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	if (!scanned && !done) {
+		/*
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		scanned = 1;
+		index = 0;
+		goto retry;
+	}
+	flush_write_bio(&epd);
+	return ret;
+}
+
 /**
  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
  * @mapping:	address space structure to write
@@ -3099,10 +3435,14 @@ retry:
 static void flush_epd_write_bio(struct extent_page_data *epd)
 {
 	if (epd->bio) {
+		int rw = WRITE;
+		int ret;
+
 		if (epd->sync_io)
-			submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
-		else
-			submit_one_bio(WRITE, epd->bio, 0, 0);
+			rw = WRITE_SYNC;
+
+		ret = submit_one_bio(rw, epd->bio, 0, 0);
+		BUG_ON(ret < 0); /* -ENOMEM */
 		epd->bio = NULL;
 	}
 }
@@ -3219,7 +3559,7 @@ int extent_readpages(struct extent_io_tree *tree,
 	}
 	BUG_ON(!list_empty(pages));
 	if (bio)
-		submit_one_bio(READ, bio, 0, bio_flags);
+		return submit_one_bio(READ, bio, 0, bio_flags);
 	return 0;
 }
 
@@ -3240,7 +3580,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 	if (start > end)
 		return 0;
 
-	lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
+	lock_extent_bits(tree, start, end, 0, &cached_state);
 	wait_on_page_writeback(page);
 	clear_extent_bit(tree, start, end,
 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@ -3454,7 +3794,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	}
 
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
-			 &cached_state, GFP_NOFS);
+			 &cached_state);
 
 	em = get_extent_skip_holes(inode, start, last_for_get_extent,
 				   get_extent);
@@ -3548,26 +3888,7 @@ out:
 inline struct page *extent_buffer_page(struct extent_buffer *eb,
 					      unsigned long i)
 {
-	struct page *p;
-	struct address_space *mapping;
-
-	if (i == 0)
-		return eb->first_page;
-	i += eb->start >> PAGE_CACHE_SHIFT;
-	mapping = eb->first_page->mapping;
-	if (!mapping)
-		return NULL;
-
-	/*
-	 * extent_buffer_page is only called after pinning the page
-	 * by increasing the reference count. So we know the page must
-	 * be in the radix tree.
-	 */
-	rcu_read_lock();
-	p = radix_tree_lookup(&mapping->page_tree, i);
-	rcu_read_unlock();
-
-	return p;
+	return eb->pages[i];
 }
 
 inline unsigned long num_extent_pages(u64 start, u64 len)
@@ -3576,6 +3897,19 @@ inline unsigned long num_extent_pages(u64 start, u64 len)
 		(start >> PAGE_CACHE_SHIFT);
 }
 
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+#if LEAK_DEBUG
+	unsigned long flags;
+	spin_lock_irqsave(&leak_lock, flags);
+	list_del(&eb->leak_list);
+	spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+	if (eb->pages && eb->pages != eb->inline_pages)
+		kfree(eb->pages);
+	kmem_cache_free(extent_buffer_cache, eb);
+}
+
 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 						   u64 start,
 						   unsigned long len,
@@ -3591,6 +3925,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 		return NULL;
 	eb->start = start;
 	eb->len = len;
+	eb->tree = tree;
 	rwlock_init(&eb->lock);
 	atomic_set(&eb->write_locks, 0);
 	atomic_set(&eb->read_locks, 0);
@@ -3607,20 +3942,32 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 	list_add(&eb->leak_list, &buffers);
 	spin_unlock_irqrestore(&leak_lock, flags);
 #endif
+	spin_lock_init(&eb->refs_lock);
 	atomic_set(&eb->refs, 1);
+	atomic_set(&eb->io_pages, 0);
+
+	if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
+		struct page **pages;
+		int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
+			PAGE_CACHE_SHIFT;
+		pages = kzalloc(num_pages, mask);
+		if (!pages) {
+			__free_extent_buffer(eb);
+			return NULL;
+		}
+		eb->pages = pages;
+	} else {
+		eb->pages = eb->inline_pages;
+	}
 
 	return eb;
 }
 
-static void __free_extent_buffer(struct extent_buffer *eb)
+static int extent_buffer_under_io(struct extent_buffer *eb)
 {
-#if LEAK_DEBUG
-	unsigned long flags;
-	spin_lock_irqsave(&leak_lock, flags);
-	list_del(&eb->leak_list);
-	spin_unlock_irqrestore(&leak_lock, flags);
-#endif
-	kmem_cache_free(extent_buffer_cache, eb);
+	return (atomic_read(&eb->io_pages) ||
+		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
 }
 
 /*
@@ -3632,8 +3979,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
 	unsigned long index;
 	struct page *page;
 
-	if (!eb->first_page)
-		return;
+	BUG_ON(extent_buffer_under_io(eb));
 
 	index = num_extent_pages(eb->start, eb->len);
 	if (start_idx >= index)
@@ -3642,8 +3988,34 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
 	do {
 		index--;
 		page = extent_buffer_page(eb, index);
-		if (page)
+		if (page) {
+			spin_lock(&page->mapping->private_lock);
+			/*
+			 * We do this since we'll remove the pages after we've
+			 * removed the eb from the radix tree, so we could race
+			 * and have this page now attached to the new eb. So
+			 * only clear page_private if it's still connected to
+			 * this eb.
+			 */
+			if (PagePrivate(page) &&
+			    page->private == (unsigned long)eb) {
+				BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+				BUG_ON(PageDirty(page));
+				BUG_ON(PageWriteback(page));
+				/*
+				 * We need to make sure we haven't be attached
+				 * to a new eb.
+				 */
+				ClearPagePrivate(page);
+				set_page_private(page, 0);
+				/* One for the page private */
+				page_cache_release(page);
+			}
+			spin_unlock(&page->mapping->private_lock);
+
+			/* One for when we alloced the page */
 			page_cache_release(page);
+		}
 	} while (index != start_idx);
 }
 
@@ -3656,9 +4028,50 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
 	__free_extent_buffer(eb);
 }
 
+static void check_buffer_tree_ref(struct extent_buffer *eb)
+{
+	/* the ref bit is tricky. We have to make sure it is set
+	 * if we have the buffer dirty. Otherwise the
+	 * code to free a buffer can end up dropping a dirty
+	 * page
+	 *
+	 * Once the ref bit is set, it won't go away while the
+	 * buffer is dirty or in writeback, and it also won't
+	 * go away while we have the reference count on the
+	 * eb bumped.
+	 *
+	 * We can't just set the ref bit without bumping the
+	 * ref on the eb because free_extent_buffer might
+	 * see the ref bit and try to clear it. If this happens
+	 * free_extent_buffer might end up dropping our original
+	 * ref by mistake and freeing the page before we are able
+	 * to add one more ref.
+	 *
+	 * So bump the ref count first, then set the bit. If someone
+	 * beat us to it, drop the ref we added.
+	 */
+	if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+		atomic_inc(&eb->refs);
+		if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+			atomic_dec(&eb->refs);
+	}
+}
+
+static void mark_extent_buffer_accessed(struct extent_buffer *eb)
+{
+	unsigned long num_pages, i;
+
+	check_buffer_tree_ref(eb);
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = extent_buffer_page(eb, i);
+		mark_page_accessed(p);
+	}
+}
+
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
-					  u64 start, unsigned long len,
-					  struct page *page0)
+					  u64 start, unsigned long len)
 {
 	unsigned long num_pages = num_extent_pages(start, len);
 	unsigned long i;
@@ -3674,7 +4087,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
 	if (eb && atomic_inc_not_zero(&eb->refs)) {
 		rcu_read_unlock();
-		mark_page_accessed(eb->first_page);
+		mark_extent_buffer_accessed(eb);
 		return eb;
 	}
 	rcu_read_unlock();
@@ -3683,32 +4096,43 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	if (!eb)
 		return NULL;
 
-	if (page0) {
-		eb->first_page = page0;
-		i = 1;
-		index++;
-		page_cache_get(page0);
-		mark_page_accessed(page0);
-		set_page_extent_mapped(page0);
-		set_page_extent_head(page0, len);
-		uptodate = PageUptodate(page0);
-	} else {
-		i = 0;
-	}
-	for (; i < num_pages; i++, index++) {
+	for (i = 0; i < num_pages; i++, index++) {
 		p = find_or_create_page(mapping, index, GFP_NOFS);
 		if (!p) {
 			WARN_ON(1);
 			goto free_eb;
 		}
-		set_page_extent_mapped(p);
-		mark_page_accessed(p);
-		if (i == 0) {
-			eb->first_page = p;
-			set_page_extent_head(p, len);
-		} else {
-			set_page_private(p, EXTENT_PAGE_PRIVATE);
+
+		spin_lock(&mapping->private_lock);
+		if (PagePrivate(p)) {
+			/*
+			 * We could have already allocated an eb for this page
+			 * and attached one so lets see if we can get a ref on
+			 * the existing eb, and if we can we know it's good and
+			 * we can just return that one, else we know we can just
+			 * overwrite page->private.
+			 */
+			exists = (struct extent_buffer *)p->private;
+			if (atomic_inc_not_zero(&exists->refs)) {
+				spin_unlock(&mapping->private_lock);
+				unlock_page(p);
+				mark_extent_buffer_accessed(exists);
+				goto free_eb;
+			}
+
+			/*
+			 * Do this so attach doesn't complain and we need to
+			 * drop the ref the old guy had.
+			 */
+			ClearPagePrivate(p);
+			WARN_ON(PageDirty(p));
+			page_cache_release(p);
 		}
+		attach_extent_buffer_page(eb, p);
+		spin_unlock(&mapping->private_lock);
+		WARN_ON(PageDirty(p));
+		mark_page_accessed(p);
+		eb->pages[i] = p;
 		if (!PageUptodate(p))
 			uptodate = 0;
 
@@ -3716,12 +4140,10 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		 * see below about how we avoid a nasty race with release page
 		 * and why we unlock later
 		 */
-		if (i != 0)
-			unlock_page(p);
 	}
 	if (uptodate)
 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
+again:
 	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
 	if (ret)
 		goto free_eb;
@@ -3731,14 +4153,21 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	if (ret == -EEXIST) {
 		exists = radix_tree_lookup(&tree->buffer,
 						start >> PAGE_CACHE_SHIFT);
-		/* add one reference for the caller */
-		atomic_inc(&exists->refs);
+		if (!atomic_inc_not_zero(&exists->refs)) {
+			spin_unlock(&tree->buffer_lock);
+			radix_tree_preload_end();
+			exists = NULL;
+			goto again;
+		}
 		spin_unlock(&tree->buffer_lock);
 		radix_tree_preload_end();
+		mark_extent_buffer_accessed(exists);
 		goto free_eb;
 	}
 	/* add one reference for the tree */
-	atomic_inc(&eb->refs);
+	spin_lock(&eb->refs_lock);
+	check_buffer_tree_ref(eb);
+	spin_unlock(&eb->refs_lock);
 	spin_unlock(&tree->buffer_lock);
 	radix_tree_preload_end();
 
@@ -3751,15 +4180,20 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
         * after the extent buffer is in the radix tree so
         * it doesn't get lost
         */
-        set_page_extent_mapped(eb->first_page);
-        set_page_extent_head(eb->first_page, eb->len);
-        if (!page0)
-                unlock_page(eb->first_page);
+        SetPageChecked(eb->pages[0]);
+        for (i = 1; i < num_pages; i++) {
+                p = extent_buffer_page(eb, i);
+                ClearPageChecked(p);
+                unlock_page(p);
+        }
+        unlock_page(eb->pages[0]);
         return eb;

 free_eb:
-        if (eb->first_page && !page0)
-                unlock_page(eb->first_page);
+        for (i = 0; i < num_pages; i++) {
+                if (eb->pages[i])
+                        unlock_page(eb->pages[i]);
+        }

         if (!atomic_dec_and_test(&eb->refs))
                 return exists;
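
For illustration: both the page->private path and the -EEXIST retry above take a reference with atomic_inc_not_zero(), which refuses to bump a count that has already dropped to zero, so a buffer in teardown is never resurrected; the caller either reuses the live buffer or retries via the again: label. A minimal userspace sketch of that primitive in C11 atomics (ref_get_not_zero is an illustrative name, not a kernel symbol):

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Take a reference only while the object is still live; never 0 -> 1. */
    static bool ref_get_not_zero(atomic_uint *refs)
    {
            unsigned int old = atomic_load(refs);

            while (old != 0) {
                    /* try old -> old + 1; on failure 'old' is reloaded */
                    if (atomic_compare_exchange_weak(refs, &old, old + 1))
                            return true;
            }
            return false;   /* lost the race with the final put */
    }
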
@@ -3776,7 +4210,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
         eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
         if (eb && atomic_inc_not_zero(&eb->refs)) {
                 rcu_read_unlock();
-                mark_page_accessed(eb->first_page);
+                mark_extent_buffer_accessed(eb);
                 return eb;
         }
         rcu_read_unlock();
@@ -3784,19 +4218,71 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
         return NULL;
 }

+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+        struct extent_buffer *eb =
+                        container_of(head, struct extent_buffer, rcu_head);
+
+        __free_extent_buffer(eb);
+}
+
+/* Expects to have eb->eb_lock already held */
+static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
+{
+        WARN_ON(atomic_read(&eb->refs) == 0);
+        if (atomic_dec_and_test(&eb->refs)) {
+                struct extent_io_tree *tree = eb->tree;
+
+                spin_unlock(&eb->refs_lock);
+
+                spin_lock(&tree->buffer_lock);
+                radix_tree_delete(&tree->buffer,
+                                  eb->start >> PAGE_CACHE_SHIFT);
+                spin_unlock(&tree->buffer_lock);
+
+                /* Should be safe to release our pages at this point */
+                btrfs_release_extent_buffer_page(eb, 0);
+
+                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
+                return;
+        }
+        spin_unlock(&eb->refs_lock);
+}
+
 void free_extent_buffer(struct extent_buffer *eb)
 {
         if (!eb)
                 return;

-        if (!atomic_dec_and_test(&eb->refs))
+        spin_lock(&eb->refs_lock);
+        if (atomic_read(&eb->refs) == 2 &&
+            test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
+            !extent_buffer_under_io(eb) &&
+            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+                atomic_dec(&eb->refs);
+
+        /*
+         * I know this is terrible, but it's temporary until we stop tracking
+         * the uptodate bits and such for the extent buffers.
+         */
+        release_extent_buffer(eb, GFP_ATOMIC);
+}
+
+void free_extent_buffer_stale(struct extent_buffer *eb)
+{
+        if (!eb)
                 return;

-        WARN_ON(1);
+        spin_lock(&eb->refs_lock);
+        set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
+
+        if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+                atomic_dec(&eb->refs);
+        release_extent_buffer(eb, GFP_NOFS);
 }

-int clear_extent_buffer_dirty(struct extent_io_tree *tree,
-                              struct extent_buffer *eb)
+void clear_extent_buffer_dirty(struct extent_buffer *eb)
 {
         unsigned long i;
         unsigned long num_pages;
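
The free_extent_buffer()/free_extent_buffer_stale() pair added above leans on a convention that is easy to miss: the radix tree itself holds one reference, flagged EXTENT_BUFFER_TREE_REF, so refs == 2 with no I/O in flight means the tree plus the one caller about to put. A compilable model of just that invariant, with illustrative names rather than kernel API:

    #include <stdatomic.h>
    #include <stdbool.h>

    struct eb_model {
            atomic_uint refs;       /* tree ref + user refs */
            atomic_bool tree_ref;   /* models EXTENT_BUFFER_TREE_REF */
            atomic_uint io_pages;   /* models eb->io_pages */
    };

    /* Hand back the tree's reference early for a stale buffer, so the
     * caller's final put (refs 1 -> 0) actually frees it. */
    static void mark_stale(struct eb_model *eb)
    {
            if (atomic_load(&eb->refs) == 2 &&
                atomic_load(&eb->io_pages) == 0 &&
                atomic_exchange(&eb->tree_ref, false))
                    atomic_fetch_sub(&eb->refs, 1);
    }
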
@@ -3812,10 +4298,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                 lock_page(page);
                 WARN_ON(!PagePrivate(page));

-                set_page_extent_mapped(page);
-                if (i == 0)
-                        set_page_extent_head(page, eb->len);
-
                 clear_page_dirty_for_io(page);
                 spin_lock_irq(&page->mapping->tree_lock);
                 if (!PageDirty(page)) {
@@ -3827,24 +4309,29 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                 ClearPageError(page);
                 unlock_page(page);
         }
-        return 0;
+        WARN_ON(atomic_read(&eb->refs) == 0);
 }

-int set_extent_buffer_dirty(struct extent_io_tree *tree,
-                            struct extent_buffer *eb)
+int set_extent_buffer_dirty(struct extent_buffer *eb)
 {
         unsigned long i;
         unsigned long num_pages;
         int was_dirty = 0;

+        check_buffer_tree_ref(eb);
+
         was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+
         num_pages = num_extent_pages(eb->start, eb->len);
+        WARN_ON(atomic_read(&eb->refs) == 0);
+        WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+
         for (i = 0; i < num_pages; i++)
-                __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+                set_page_dirty(extent_buffer_page(eb, i));
         return was_dirty;
 }

-static int __eb_straddles_pages(u64 start, u64 len)
+static int range_straddles_pages(u64 start, u64 len)
 {
         if (len < PAGE_CACHE_SIZE)
                 return 1;
@@ -3855,25 +4342,14 @@ static int __eb_straddles_pages(u64 start, u64 len)
         return 0;
 }

-static int eb_straddles_pages(struct extent_buffer *eb)
-{
-        return __eb_straddles_pages(eb->start, eb->len);
-}
-
-int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-                                 struct extent_buffer *eb,
-                                 struct extent_state **cached_state)
+int clear_extent_buffer_uptodate(struct extent_buffer *eb)
 {
         unsigned long i;
         struct page *page;
         unsigned long num_pages;

-        num_pages = num_extent_pages(eb->start, eb->len);
         clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
-        clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-                              cached_state, GFP_NOFS);
-
+        num_pages = num_extent_pages(eb->start, eb->len);
         for (i = 0; i < num_pages; i++) {
                 page = extent_buffer_page(eb, i);
                 if (page)
@@ -3882,27 +4358,16 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
         return 0;
 }

-int set_extent_buffer_uptodate(struct extent_io_tree *tree,
-                               struct extent_buffer *eb)
+int set_extent_buffer_uptodate(struct extent_buffer *eb)
 {
         unsigned long i;
         struct page *page;
         unsigned long num_pages;

+        set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
         num_pages = num_extent_pages(eb->start, eb->len);
-
-        if (eb_straddles_pages(eb)) {
-                set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-                                    NULL, GFP_NOFS);
-        }
         for (i = 0; i < num_pages; i++) {
                 page = extent_buffer_page(eb, i);
-                if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
-                    ((i == num_pages - 1) &&
-                     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
-                        check_page_uptodate(tree, page);
-                        continue;
-                }
                 SetPageUptodate(page);
         }
         return 0;
@@ -3917,7 +4382,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
         int uptodate;
         unsigned long index;

-        if (__eb_straddles_pages(start, end - start + 1)) {
+        if (range_straddles_pages(start, end - start + 1)) {
                 ret = test_range_bit(tree, start, end,
                                      EXTENT_UPTODATE, 1, NULL);
                 if (ret)
@@ -3939,35 +4404,9 @@ int extent_range_uptodate(struct extent_io_tree *tree,
         return pg_uptodate;
 }

-int extent_buffer_uptodate(struct extent_io_tree *tree,
-                           struct extent_buffer *eb,
-                           struct extent_state *cached_state)
+int extent_buffer_uptodate(struct extent_buffer *eb)
 {
-        int ret = 0;
-        unsigned long num_pages;
-        unsigned long i;
-        struct page *page;
-        int pg_uptodate = 1;
-
-        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
-                return 1;
-
-        if (eb_straddles_pages(eb)) {
-                ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-                                     EXTENT_UPTODATE, 1, cached_state);
-                if (ret)
-                        return ret;
-        }
-
-        num_pages = num_extent_pages(eb->start, eb->len);
-        for (i = 0; i < num_pages; i++) {
-                page = extent_buffer_page(eb, i);
-                if (!PageUptodate(page)) {
-                        pg_uptodate = 0;
-                        break;
-                }
-        }
-        return pg_uptodate;
+        return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 }

 int read_extent_buffer_pages(struct extent_io_tree *tree,
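
Taken together, the hunks above retire the per-byte-range EXTENT_UPTODATE state kept in the extent_io tree: an extent buffer's uptodate-ness is now a single bit in eb->bflags, which is why extent_buffer_uptodate() collapses to one test_bit(). The bit idioms involved, sketched with C11 atomics for illustration (the kernel's set_bit/test_and_set_bit operate on bitmaps, not plain words):

    #include <stdatomic.h>
    #include <stdbool.h>

    #define EB_UPTODATE (1u << 0)
    #define EB_DIRTY    (1u << 1)

    static void set_flag(atomic_uint *bflags, unsigned int flag)
    {
            atomic_fetch_or(bflags, flag);
    }

    static bool test_flag(atomic_uint *bflags, unsigned int flag)
    {
            return atomic_load(bflags) & flag;
    }

    /* Returns the previous state: the "was_dirty" idiom used in
     * set_extent_buffer_dirty() above. */
    static bool test_and_set_flag(atomic_uint *bflags, unsigned int flag)
    {
            return atomic_fetch_or(bflags, flag) & flag;
    }
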
@@ -3981,21 +4420,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
         int ret = 0;
         int locked_pages = 0;
         int all_uptodate = 1;
-        int inc_all_pages = 0;
         unsigned long num_pages;
+        unsigned long num_reads = 0;
         struct bio *bio = NULL;
         unsigned long bio_flags = 0;

         if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
                 return 0;

-        if (eb_straddles_pages(eb)) {
-                if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-                                   EXTENT_UPTODATE, 1, NULL)) {
-                        return 0;
-                }
-        }
-
         if (start) {
                 WARN_ON(start < eb->start);
                 start_i = (start >> PAGE_CACHE_SHIFT) -
@@ -4014,8 +4446,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
                         lock_page(page);
                 }
                 locked_pages++;
-                if (!PageUptodate(page))
+                if (!PageUptodate(page)) {
+                        num_reads++;
                         all_uptodate = 0;
+                }
         }
         if (all_uptodate) {
                 if (start_i == 0)
@@ -4023,20 +4457,12 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
                 goto unlock_exit;
         }

+        clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+        eb->failed_mirror = 0;
+        atomic_set(&eb->io_pages, num_reads);
         for (i = start_i; i < num_pages; i++) {
                 page = extent_buffer_page(eb, i);
-
-                WARN_ON(!PagePrivate(page));
-
-                set_page_extent_mapped(page);
-                if (i == 0)
-                        set_page_extent_head(page, eb->len);
-
-                if (inc_all_pages)
-                        page_cache_get(page);
                 if (!PageUptodate(page)) {
-                        if (start_i == 0)
-                                inc_all_pages = 1;
                         ClearPageError(page);
                         err = __extent_read_full_page(tree, page,
                                                 get_extent, &bio,
@@ -4048,8 +4474,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
                 }
         }

-        if (bio)
-                submit_one_bio(READ, bio, mirror_num, bio_flags);
+        if (bio) {
+                err = submit_one_bio(READ, bio, mirror_num, bio_flags);
+                if (err)
+                        return err;
+        }

         if (ret || wait != WAIT_COMPLETE)
                 return ret;
@@ -4061,8 +4490,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
                 ret = -EIO;
         }

-        if (!ret)
-                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
         return ret;

 unlock_exit:
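
The read path above stops setting EXTENT_BUFFER_UPTODATE itself: it counts the pages that actually need I/O (num_reads), stores the count in eb->io_pages, and leaves the final state flip to completion handling, so the bit can be set exactly when the last outstanding read finishes. A hedged sketch of that countdown pattern, with a hypothetical per-page completion callback:

    #include <stdatomic.h>
    #include <stdbool.h>

    struct eb_io_model {
            atomic_uint io_pages;   /* reads still in flight */
            atomic_uint bflags;     /* bit 0 models EXTENT_BUFFER_UPTODATE */
    };

    /* Hypothetical completion callback for one page's read. */
    static void page_read_done(struct eb_io_model *eb, bool ok)
    {
            if (!ok)
                    return;         /* a real error path would set IOERR */
            /* the last completed read flips the whole buffer uptodate */
            if (atomic_fetch_sub(&eb->io_pages, 1) == 1)
                    atomic_fetch_or(&eb->bflags, 1u << 0);
    }
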
@@ -4304,15 +4731,20 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
 {
         char *dst_kaddr = page_address(dst_page);
         char *src_kaddr;
+        int must_memmove = 0;

         if (dst_page != src_page) {
                 src_kaddr = page_address(src_page);
         } else {
                 src_kaddr = dst_kaddr;
-                BUG_ON(areas_overlap(src_off, dst_off, len));
+                if (areas_overlap(src_off, dst_off, len))
+                        must_memmove = 1;
         }

-        memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+        if (must_memmove)
+                memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
+        else
+                memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
 }

 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
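
copy_pages() now downgrades to memmove() only when source and destination fall in the same page and genuinely overlap, where the old code simply hit BUG_ON(); memcpy() stays the fast path because its no-overlap contract lets implementations copy in any order. A small userspace illustration of why the distinction matters:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char a[16] = "abcdef";

            /* Overlapping ranges in one buffer: memmove() is defined
             * and copies as if through a temporary buffer... */
            memmove(a + 2, a, 6);
            puts(a);        /* prints "ababcdef" */

            /* ...while memcpy() on the same arguments would be undefined
             * behaviour, which is what must_memmove guards against. */
            return 0;
    }
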
@@ -4382,7 +4814,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
                        "len %lu len %lu\n", dst_offset, len, dst->len);
                 BUG_ON(1);
         }
-        if (!areas_overlap(src_offset, dst_offset, len)) {
+        if (dst_offset < src_offset) {
                 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
                 return;
         }
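
The hunk above also loosens the condition for the fast path: rather than requiring no overlap at all, memmove_extent_buffer() falls through to the forward copy whenever dst_offset < src_offset, since a front-to-back copy never reads a byte it has already overwritten (same-page overlap is still handled by the memmove fallback in copy_pages() above). A toy demonstration of that direction rule:

    #include <stdio.h>

    int main(void)
    {
            char buf[] = "..abcdef";

            /* dst (0) < src (2): a forward, byte-at-a-time copy is safe
             * even though the two ranges overlap */
            for (int i = 0; i < 6; i++)
                    buf[i] = buf[i + 2];
            buf[6] = '\0';
            puts(buf);      /* prints "abcdef" */
            return 0;
    }
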
@@ -4408,47 +4840,48 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
         }
 }

-static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+int try_release_extent_buffer(struct page *page, gfp_t mask)
 {
-        struct extent_buffer *eb =
-                        container_of(head, struct extent_buffer, rcu_head);
-
-        btrfs_release_extent_buffer(eb);
-}
-
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
-{
-        u64 start = page_offset(page);
         struct extent_buffer *eb;
-        int ret = 1;

-        spin_lock(&tree->buffer_lock);
-        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-        if (!eb) {
-                spin_unlock(&tree->buffer_lock);
-                return ret;
+        /*
+         * We need to make sure noboody is attaching this page to an eb right
+         * now.
+         */
+        spin_lock(&page->mapping->private_lock);
+        if (!PagePrivate(page)) {
+                spin_unlock(&page->mapping->private_lock);
+                return 1;
         }

-        if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-                ret = 0;
-                goto out;
-        }
+        eb = (struct extent_buffer *)page->private;
+        BUG_ON(!eb);

         /*
-         * set @eb->refs to 0 if it is already 1, and then release the @eb.
-         * Or go back.
+         * This is a little awful but should be ok, we need to make sure that
+         * the eb doesn't disappear out from under us while we're looking at
+         * this page.
         */
-        if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
-                ret = 0;
-                goto out;
+        spin_lock(&eb->refs_lock);
+        if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+                spin_unlock(&eb->refs_lock);
+                spin_unlock(&page->mapping->private_lock);
+                return 0;
         }
+        spin_unlock(&page->mapping->private_lock);

-        radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-out:
-        spin_unlock(&tree->buffer_lock);
+        if ((mask & GFP_NOFS) == GFP_NOFS)
+                mask = GFP_NOFS;

-        /* at this point we can safely release the extent buffer */
-        if (atomic_read(&eb->refs) == 0)
-                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
-        return ret;
+        /*
+         * If tree ref isn't set then we know the ref on this eb is a real ref,
+         * so just return, this page will likely be freed soon anyway.
+         */
+        if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+                spin_unlock(&eb->refs_lock);
+                return 0;
+        }
+        release_extent_buffer(eb, mask);
+
+        return 1;
 }
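
The rewritten try_release_extent_buffer() above takes mapping->private_lock before touching eb->refs_lock, mirroring the attach side in alloc_extent_buffer(), which installs page->private under that same private_lock, so a page cannot be detached while another thread is mid-attach or while anyone besides the tree still holds a reference. A sketch of that two-lock handshake using pthreads; the types and names here are illustrative, not kernel API:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct eb_ref {
            pthread_mutex_t refs_lock;
            atomic_uint refs;
    };

    struct page_model {
            pthread_mutex_t private_lock;   /* models mapping->private_lock */
            struct eb_ref *private;         /* models page->private */
    };

    /* Detach the page's buffer only if we hold the last reference. */
    static bool try_release(struct page_model *pg)
    {
            struct eb_ref *eb;

            pthread_mutex_lock(&pg->private_lock);
            eb = pg->private;
            if (!eb) {
                    pthread_mutex_unlock(&pg->private_lock);
                    return true;    /* nothing attached */
            }
            pthread_mutex_lock(&eb->refs_lock);
            if (atomic_load(&eb->refs) != 1) {
                    /* still in use elsewhere: back out in reverse order */
                    pthread_mutex_unlock(&eb->refs_lock);
                    pthread_mutex_unlock(&pg->private_lock);
                    return false;
            }
            pg->private = NULL;
            pthread_mutex_unlock(&pg->private_lock);
            /* drop the final ref and free, as release_extent_buffer() does */
            pthread_mutex_unlock(&eb->refs_lock);
            return true;
    }
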