Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--  fs/btrfs/extent_io.c  935
1 files changed, 735 insertions, 200 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7055d11c1efd..be1bf627a14b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
17#include "compat.h" 17#include "compat.h"
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h"
20 21
21static struct kmem_cache *extent_state_cache; 22static struct kmem_cache *extent_state_cache;
22static struct kmem_cache *extent_buffer_cache; 23static struct kmem_cache *extent_buffer_cache;
@@ -254,14 +255,14 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
254 * 255 *
255 * This should be called with the tree lock held. 256 * This should be called with the tree lock held.
256 */ 257 */
257static int merge_state(struct extent_io_tree *tree, 258static void merge_state(struct extent_io_tree *tree,
258 struct extent_state *state) 259 struct extent_state *state)
259{ 260{
260 struct extent_state *other; 261 struct extent_state *other;
261 struct rb_node *other_node; 262 struct rb_node *other_node;
262 263
263 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 264 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
264 return 0; 265 return;
265 266
266 other_node = rb_prev(&state->rb_node); 267 other_node = rb_prev(&state->rb_node);
267 if (other_node) { 268 if (other_node) {
@@ -281,26 +282,19 @@ static int merge_state(struct extent_io_tree *tree,
281 if (other->start == state->end + 1 && 282 if (other->start == state->end + 1 &&
282 other->state == state->state) { 283 other->state == state->state) {
283 merge_cb(tree, state, other); 284 merge_cb(tree, state, other);
284 other->start = state->start; 285 state->end = other->end;
285 state->tree = NULL; 286 other->tree = NULL;
286 rb_erase(&state->rb_node, &tree->state); 287 rb_erase(&other->rb_node, &tree->state);
287 free_extent_state(state); 288 free_extent_state(other);
288 state = NULL;
289 } 289 }
290 } 290 }
291
292 return 0;
293} 291}
294 292
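The hunk above also flips the merge direction: when the next state is contiguous and has identical bits, merge_state() now extends state->end and frees the neighbour, instead of folding state into it, so a pointer to state (for example one held via cached_state) stays valid across the merge. A minimal user-space sketch of that forward merge, using hypothetical types rather than the kernel's rb-tree-backed extent_state:

```c
#include <stdio.h>
#include <stdlib.h>

/* toy stand-in for struct extent_state: a sorted singly linked list */
struct range {
	unsigned long long start;
	unsigned long long end;		/* inclusive */
	unsigned int flags;
	struct range *next;
};

/*
 * Absorb the following range into 'cur' when the two are contiguous and
 * carry the same flags.  'cur' survives, mirroring the patched
 * merge_state() which frees the other node and keeps 'state'.
 */
static void merge_forward(struct range *cur)
{
	struct range *other = cur->next;

	if (other && other->start == cur->end + 1 && other->flags == cur->flags) {
		cur->end = other->end;		/* extend cur instead of freeing it */
		cur->next = other->next;
		free(other);
	}
}

int main(void)
{
	struct range *b = malloc(sizeof(*b));
	struct range *a = malloc(sizeof(*a));

	*b = (struct range){ .start = 4096, .end = 8191, .flags = 1, .next = NULL };
	*a = (struct range){ .start = 0, .end = 4095, .flags = 1, .next = b };

	merge_forward(a);
	printf("merged: [%llu, %llu]\n", a->start, a->end);	/* [0, 8191] */
	free(a);
	return 0;
}
```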
295static int set_state_cb(struct extent_io_tree *tree, 293static void set_state_cb(struct extent_io_tree *tree,
296 struct extent_state *state, int *bits) 294 struct extent_state *state, int *bits)
297{ 295{
298 if (tree->ops && tree->ops->set_bit_hook) { 296 if (tree->ops && tree->ops->set_bit_hook)
299 return tree->ops->set_bit_hook(tree->mapping->host, 297 tree->ops->set_bit_hook(tree->mapping->host, state, bits);
300 state, bits);
301 }
302
303 return 0;
304} 298}
305 299
306static void clear_state_cb(struct extent_io_tree *tree, 300static void clear_state_cb(struct extent_io_tree *tree,
@@ -310,6 +304,9 @@ static void clear_state_cb(struct extent_io_tree *tree,
310 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 304 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
311} 305}
312 306
307static void set_state_bits(struct extent_io_tree *tree,
308 struct extent_state *state, int *bits);
309
313/* 310/*
314 * insert an extent_state struct into the tree. 'bits' are set on the 311 * insert an extent_state struct into the tree. 'bits' are set on the
315 * struct before it is inserted. 312 * struct before it is inserted.
@@ -325,8 +322,6 @@ static int insert_state(struct extent_io_tree *tree,
325 int *bits) 322 int *bits)
326{ 323{
327 struct rb_node *node; 324 struct rb_node *node;
328 int bits_to_set = *bits & ~EXTENT_CTLBITS;
329 int ret;
330 325
331 if (end < start) { 326 if (end < start) {
332 printk(KERN_ERR "btrfs end < start %llu %llu\n", 327 printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -336,13 +331,9 @@ static int insert_state(struct extent_io_tree *tree,
336 } 331 }
337 state->start = start; 332 state->start = start;
338 state->end = end; 333 state->end = end;
339 ret = set_state_cb(tree, state, bits);
340 if (ret)
341 return ret;
342 334
343 if (bits_to_set & EXTENT_DIRTY) 335 set_state_bits(tree, state, bits);
344 tree->dirty_bytes += end - start + 1; 336
345 state->state |= bits_to_set;
346 node = tree_insert(&tree->state, end, &state->rb_node); 337 node = tree_insert(&tree->state, end, &state->rb_node);
347 if (node) { 338 if (node) {
348 struct extent_state *found; 339 struct extent_state *found;
@@ -351,7 +342,6 @@ static int insert_state(struct extent_io_tree *tree,
351 "%llu %llu\n", (unsigned long long)found->start, 342 "%llu %llu\n", (unsigned long long)found->start,
352 (unsigned long long)found->end, 343 (unsigned long long)found->end,
353 (unsigned long long)start, (unsigned long long)end); 344 (unsigned long long)start, (unsigned long long)end);
354 free_extent_state(state);
355 return -EEXIST; 345 return -EEXIST;
356 } 346 }
357 state->tree = tree; 347 state->tree = tree;
@@ -359,13 +349,11 @@ static int insert_state(struct extent_io_tree *tree,
359 return 0; 349 return 0;
360} 350}
361 351
362static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, 352static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
363 u64 split) 353 u64 split)
364{ 354{
365 if (tree->ops && tree->ops->split_extent_hook) 355 if (tree->ops && tree->ops->split_extent_hook)
366 return tree->ops->split_extent_hook(tree->mapping->host, 356 tree->ops->split_extent_hook(tree->mapping->host, orig, split);
367 orig, split);
368 return 0;
369} 357}
370 358
371/* 359/*
@@ -500,7 +488,8 @@ again:
500 cached_state = NULL; 488 cached_state = NULL;
501 } 489 }
502 490
503 if (cached && cached->tree && cached->start == start) { 491 if (cached && cached->tree && cached->start <= start &&
492 cached->end > start) {
504 if (clear) 493 if (clear)
505 atomic_dec(&cached->refs); 494 atomic_dec(&cached->refs);
506 state = cached; 495 state = cached;
@@ -660,34 +649,25 @@ again:
660 if (start > end) 649 if (start > end)
661 break; 650 break;
662 651
663 if (need_resched()) { 652 cond_resched_lock(&tree->lock);
664 spin_unlock(&tree->lock);
665 cond_resched();
666 spin_lock(&tree->lock);
667 }
668 } 653 }
669out: 654out:
670 spin_unlock(&tree->lock); 655 spin_unlock(&tree->lock);
671 return 0; 656 return 0;
672} 657}
673 658
674static int set_state_bits(struct extent_io_tree *tree, 659static void set_state_bits(struct extent_io_tree *tree,
675 struct extent_state *state, 660 struct extent_state *state,
676 int *bits) 661 int *bits)
677{ 662{
678 int ret;
679 int bits_to_set = *bits & ~EXTENT_CTLBITS; 663 int bits_to_set = *bits & ~EXTENT_CTLBITS;
680 664
681 ret = set_state_cb(tree, state, bits); 665 set_state_cb(tree, state, bits);
682 if (ret)
683 return ret;
684 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 666 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
685 u64 range = state->end - state->start + 1; 667 u64 range = state->end - state->start + 1;
686 tree->dirty_bytes += range; 668 tree->dirty_bytes += range;
687 } 669 }
688 state->state |= bits_to_set; 670 state->state |= bits_to_set;
689
690 return 0;
691} 671}
692 672
693static void cache_state(struct extent_state *state, 673static void cache_state(struct extent_state *state,
@@ -742,7 +722,8 @@ again:
742 spin_lock(&tree->lock); 722 spin_lock(&tree->lock);
743 if (cached_state && *cached_state) { 723 if (cached_state && *cached_state) {
744 state = *cached_state; 724 state = *cached_state;
745 if (state->start == start && state->tree) { 725 if (state->start <= start && state->end > start &&
726 state->tree) {
746 node = &state->rb_node; 727 node = &state->rb_node;
747 goto hit_next; 728 goto hit_next;
748 } 729 }
@@ -779,17 +760,15 @@ hit_next:
779 goto out; 760 goto out;
780 } 761 }
781 762
782 err = set_state_bits(tree, state, &bits); 763 set_state_bits(tree, state, &bits);
783 if (err)
784 goto out;
785 764
786 next_node = rb_next(node);
787 cache_state(state, cached_state); 765 cache_state(state, cached_state);
788 merge_state(tree, state); 766 merge_state(tree, state);
789 if (last_end == (u64)-1) 767 if (last_end == (u64)-1)
790 goto out; 768 goto out;
791 769
792 start = last_end + 1; 770 start = last_end + 1;
771 next_node = rb_next(&state->rb_node);
793 if (next_node && start < end && prealloc && !need_resched()) { 772 if (next_node && start < end && prealloc && !need_resched()) {
794 state = rb_entry(next_node, struct extent_state, 773 state = rb_entry(next_node, struct extent_state,
795 rb_node); 774 rb_node);
@@ -830,9 +809,7 @@ hit_next:
830 if (err) 809 if (err)
831 goto out; 810 goto out;
832 if (state->end <= end) { 811 if (state->end <= end) {
833 err = set_state_bits(tree, state, &bits); 812 set_state_bits(tree, state, &bits);
834 if (err)
835 goto out;
836 cache_state(state, cached_state); 813 cache_state(state, cached_state);
837 merge_state(tree, state); 814 merge_state(tree, state);
838 if (last_end == (u64)-1) 815 if (last_end == (u64)-1)
@@ -862,7 +839,6 @@ hit_next:
862 * Avoid to free 'prealloc' if it can be merged with 839 * Avoid to free 'prealloc' if it can be merged with
863 * the later extent. 840 * the later extent.
864 */ 841 */
865 atomic_inc(&prealloc->refs);
866 err = insert_state(tree, prealloc, start, this_end, 842 err = insert_state(tree, prealloc, start, this_end,
867 &bits); 843 &bits);
868 BUG_ON(err == -EEXIST); 844 BUG_ON(err == -EEXIST);
@@ -872,7 +848,6 @@ hit_next:
872 goto out; 848 goto out;
873 } 849 }
874 cache_state(prealloc, cached_state); 850 cache_state(prealloc, cached_state);
875 free_extent_state(prealloc);
876 prealloc = NULL; 851 prealloc = NULL;
877 start = this_end + 1; 852 start = this_end + 1;
878 goto search_again; 853 goto search_again;
@@ -895,12 +870,196 @@ hit_next:
895 err = split_state(tree, state, prealloc, end + 1); 870 err = split_state(tree, state, prealloc, end + 1);
896 BUG_ON(err == -EEXIST); 871 BUG_ON(err == -EEXIST);
897 872
898 err = set_state_bits(tree, prealloc, &bits); 873 set_state_bits(tree, prealloc, &bits);
874 cache_state(prealloc, cached_state);
875 merge_state(tree, prealloc);
876 prealloc = NULL;
877 goto out;
878 }
879
880 goto search_again;
881
882out:
883 spin_unlock(&tree->lock);
884 if (prealloc)
885 free_extent_state(prealloc);
886
887 return err;
888
889search_again:
890 if (start > end)
891 goto out;
892 spin_unlock(&tree->lock);
893 if (mask & __GFP_WAIT)
894 cond_resched();
895 goto again;
896}
897
898/**
899 * convert_extent - convert all bits in a given range from one bit to another
900 * @tree: the io tree to search
901 * @start: the start offset in bytes
902 * @end: the end offset in bytes (inclusive)
903 * @bits: the bits to set in this range
904 * @clear_bits: the bits to clear in this range
905 * @mask: the allocation mask
906 *
907 * This will go through and set bits for the given range. If any states exist
908 * already in this range they are set with the given bit and cleared of the
909 * clear_bits. This is only meant to be used by things that are mergeable, ie
910 * converting from say DELALLOC to DIRTY. This is not meant to be used with
911 * boundary bits like LOCK.
912 */
913int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
914 int bits, int clear_bits, gfp_t mask)
915{
916 struct extent_state *state;
917 struct extent_state *prealloc = NULL;
918 struct rb_node *node;
919 int err = 0;
920 u64 last_start;
921 u64 last_end;
922
923again:
924 if (!prealloc && (mask & __GFP_WAIT)) {
925 prealloc = alloc_extent_state(mask);
926 if (!prealloc)
927 return -ENOMEM;
928 }
929
930 spin_lock(&tree->lock);
931 /*
932 * this search will find all the extents that end after
933 * our range starts.
934 */
935 node = tree_search(tree, start);
936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc);
938 if (!prealloc)
939 return -ENOMEM;
940 err = insert_state(tree, prealloc, start, end, &bits);
941 prealloc = NULL;
942 BUG_ON(err == -EEXIST);
943 goto out;
944 }
945 state = rb_entry(node, struct extent_state, rb_node);
946hit_next:
947 last_start = state->start;
948 last_end = state->end;
949
950 /*
951 * | ---- desired range ---- |
952 * | state |
953 *
954 * Just lock what we found and keep going
955 */
956 if (state->start == start && state->end <= end) {
957 struct rb_node *next_node;
958
959 set_state_bits(tree, state, &bits);
960 clear_state_bit(tree, state, &clear_bits, 0);
961
962 merge_state(tree, state);
963 if (last_end == (u64)-1)
964 goto out;
965
966 start = last_end + 1;
967 next_node = rb_next(&state->rb_node);
968 if (next_node && start < end && prealloc && !need_resched()) {
969 state = rb_entry(next_node, struct extent_state,
970 rb_node);
971 if (state->start == start)
972 goto hit_next;
973 }
974 goto search_again;
975 }
976
977 /*
978 * | ---- desired range ---- |
979 * | state |
980 * or
981 * | ------------- state -------------- |
982 *
983 * We need to split the extent we found, and may flip bits on
984 * second half.
985 *
986 * If the extent we found extends past our
987 * range, we just split and search again. It'll get split
988 * again the next time though.
989 *
990 * If the extent we found is inside our range, we set the
991 * desired bit on it.
992 */
993 if (state->start < start) {
994 prealloc = alloc_extent_state_atomic(prealloc);
995 if (!prealloc)
996 return -ENOMEM;
997 err = split_state(tree, state, prealloc, start);
998 BUG_ON(err == -EEXIST);
999 prealloc = NULL;
1000 if (err)
1001 goto out;
1002 if (state->end <= end) {
1003 set_state_bits(tree, state, &bits);
1004 clear_state_bit(tree, state, &clear_bits, 0);
1005 merge_state(tree, state);
1006 if (last_end == (u64)-1)
1007 goto out;
1008 start = last_end + 1;
1009 }
1010 goto search_again;
1011 }
1012 /*
1013 * | ---- desired range ---- |
1014 * | state | or | state |
1015 *
1016 * There's a hole, we need to insert something in it and
1017 * ignore the extent we found.
1018 */
1019 if (state->start > start) {
1020 u64 this_end;
1021 if (end < last_start)
1022 this_end = end;
1023 else
1024 this_end = last_start - 1;
1025
1026 prealloc = alloc_extent_state_atomic(prealloc);
1027 if (!prealloc)
1028 return -ENOMEM;
1029
1030 /*
1031 * Avoid to free 'prealloc' if it can be merged with
1032 * the later extent.
1033 */
1034 err = insert_state(tree, prealloc, start, this_end,
1035 &bits);
1036 BUG_ON(err == -EEXIST);
899 if (err) { 1037 if (err) {
1038 free_extent_state(prealloc);
900 prealloc = NULL; 1039 prealloc = NULL;
901 goto out; 1040 goto out;
902 } 1041 }
903 cache_state(prealloc, cached_state); 1042 prealloc = NULL;
1043 start = this_end + 1;
1044 goto search_again;
1045 }
1046 /*
1047 * | ---- desired range ---- |
1048 * | state |
1049 * We need to split the extent, and set the bit
1050 * on the first half
1051 */
1052 if (state->start <= end && state->end > end) {
1053 prealloc = alloc_extent_state_atomic(prealloc);
1054 if (!prealloc)
1055 return -ENOMEM;
1056
1057 err = split_state(tree, state, prealloc, end + 1);
1058 BUG_ON(err == -EEXIST);
1059
1060 set_state_bits(tree, prealloc, &bits);
1061 clear_state_bit(tree, prealloc, &clear_bits, 0);
1062
904 merge_state(tree, prealloc); 1063 merge_state(tree, prealloc);
905 prealloc = NULL; 1064 prealloc = NULL;
906 goto out; 1065 goto out;
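The convert_extent_bit() kernel-doc earlier in this hunk describes a set-and-clear pass over a byte range (e.g. converting DELALLOC to DIRTY). A rough user-space model of that contract, using a flat array and made-up flag values instead of the locked rb-tree with its split/merge handling:

```c
#include <stdio.h>

struct state {
	unsigned long long start, end;	/* inclusive byte range */
	unsigned int bits;
};

/* made-up flag values, for illustration only */
#define EX_DELALLOC 0x1
#define EX_DIRTY    0x2

/* For every state overlapping [start, end], set 'bits' and clear 'clear_bits'. */
static void convert_range(struct state *s, int n, unsigned long long start,
			  unsigned long long end, unsigned int bits,
			  unsigned int clear_bits)
{
	for (int i = 0; i < n; i++) {
		if (s[i].end < start || s[i].start > end)
			continue;
		s[i].bits |= bits;
		s[i].bits &= ~clear_bits;
	}
}

int main(void)
{
	struct state s[] = {
		{ 0, 4095, EX_DELALLOC },
		{ 4096, 8191, EX_DELALLOC },
		{ 8192, 12287, 0 },
	};

	/* convert DELALLOC to DIRTY over the first two states only */
	convert_range(s, 3, 0, 8191, EX_DIRTY, EX_DELALLOC);

	for (int i = 0; i < 3; i++)
		printf("[%llu, %llu] bits=0x%x\n", s[i].start, s[i].end, s[i].bits);
	return 0;
}
```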
@@ -949,7 +1108,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
949 struct extent_state **cached_state, gfp_t mask) 1108 struct extent_state **cached_state, gfp_t mask)
950{ 1109{
951 return set_extent_bit(tree, start, end, 1110 return set_extent_bit(tree, start, end,
952 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 1111 EXTENT_DELALLOC | EXTENT_UPTODATE,
953 0, NULL, cached_state, mask); 1112 0, NULL, cached_state, mask);
954} 1113}
955 1114
@@ -1061,46 +1220,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1061 return 0; 1220 return 0;
1062} 1221}
1063 1222
1064/*
1065 * find the first offset in the io tree with 'bits' set. zero is
1066 * returned if we find something, and *start_ret and *end_ret are
1067 * set to reflect the state struct that was found.
1068 *
1069 * If nothing was found, 1 is returned, < 0 on error
1070 */
1071int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1072 u64 *start_ret, u64 *end_ret, int bits)
1073{
1074 struct rb_node *node;
1075 struct extent_state *state;
1076 int ret = 1;
1077
1078 spin_lock(&tree->lock);
1079 /*
1080 * this search will find all the extents that end after
1081 * our range starts.
1082 */
1083 node = tree_search(tree, start);
1084 if (!node)
1085 goto out;
1086
1087 while (1) {
1088 state = rb_entry(node, struct extent_state, rb_node);
1089 if (state->end >= start && (state->state & bits)) {
1090 *start_ret = state->start;
1091 *end_ret = state->end;
1092 ret = 0;
1093 break;
1094 }
1095 node = rb_next(node);
1096 if (!node)
1097 break;
1098 }
1099out:
1100 spin_unlock(&tree->lock);
1101 return ret;
1102}
1103
1104/* find the first state struct with 'bits' set after 'start', and 1223/* find the first state struct with 'bits' set after 'start', and
1105 * return it. tree->lock must be held. NULL will returned if 1224 * return it. tree->lock must be held. NULL will returned if
1106 * nothing was found after 'start' 1225 * nothing was found after 'start'
@@ -1133,6 +1252,30 @@ out:
1133} 1252}
1134 1253
1135/* 1254/*
1255 * find the first offset in the io tree with 'bits' set. zero is
1256 * returned if we find something, and *start_ret and *end_ret are
1257 * set to reflect the state struct that was found.
1258 *
1259 * If nothing was found, 1 is returned, < 0 on error
1260 */
1261int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1262 u64 *start_ret, u64 *end_ret, int bits)
1263{
1264 struct extent_state *state;
1265 int ret = 1;
1266
1267 spin_lock(&tree->lock);
1268 state = find_first_extent_bit_state(tree, start, bits);
1269 if (state) {
1270 *start_ret = state->start;
1271 *end_ret = state->end;
1272 ret = 0;
1273 }
1274 spin_unlock(&tree->lock);
1275 return ret;
1276}
1277
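The comment above spells out the return convention (0 plus filled out-parameters on a hit, 1 when nothing matches). A toy user-space lookup restating just that convention, not the io-tree search itself:

```c
#include <stdio.h>

struct state { unsigned long long start, end; unsigned int bits; };

/* return 0 and fill the out-parameters on a match, 1 when nothing is found */
static int find_first_bit_range(const struct state *s, int n,
				unsigned long long from, unsigned int bits,
				unsigned long long *start_ret,
				unsigned long long *end_ret)
{
	for (int i = 0; i < n; i++) {
		if (s[i].end >= from && (s[i].bits & bits)) {
			*start_ret = s[i].start;
			*end_ret = s[i].end;
			return 0;
		}
	}
	return 1;
}

int main(void)
{
	const struct state tbl[] = { { 0, 4095, 0x1 }, { 8192, 12287, 0x2 } };
	unsigned long long s, e;

	if (find_first_bit_range(tbl, 2, 4096, 0x2, &s, &e) == 0)
		printf("found [%llu, %llu]\n", s, e);	/* [8192, 12287] */
	else
		printf("nothing found\n");
	return 0;
}
```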
1278/*
1136 * find a contiguous range of bytes in the file marked as delalloc, not 1279 * find a contiguous range of bytes in the file marked as delalloc, not
1137 * more than 'max_bytes'. start and end are used to return the range, 1280 * more than 'max_bytes'. start and end are used to return the range,
1138 * 1281 *
@@ -1564,7 +1707,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1564 int bitset = 0; 1707 int bitset = 0;
1565 1708
1566 spin_lock(&tree->lock); 1709 spin_lock(&tree->lock);
1567 if (cached && cached->tree && cached->start == start) 1710 if (cached && cached->tree && cached->start <= start &&
1711 cached->end > start)
1568 node = &cached->rb_node; 1712 node = &cached->rb_node;
1569 else 1713 else
1570 node = tree_search(tree, start); 1714 node = tree_search(tree, start);
@@ -1644,6 +1788,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
1644 return 0; 1788 return 0;
1645} 1789}
1646 1790
1791/*
1792 * When IO fails, either with EIO or csum verification fails, we
1793 * try other mirrors that might have a good copy of the data. This
1794 * io_failure_record is used to record state as we go through all the
1795 * mirrors. If another mirror has good data, the page is set up to date
1796 * and things continue. If a good mirror can't be found, the original
1797 * bio end_io callback is called to indicate things have failed.
1798 */
1799struct io_failure_record {
1800 struct page *page;
1801 u64 start;
1802 u64 len;
1803 u64 logical;
1804 unsigned long bio_flags;
1805 int this_mirror;
1806 int failed_mirror;
1807 int in_validation;
1808};
1809
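The comment above lays out the flow: remember which mirror produced the bad read, try the remaining copies until one verifies, and only fail the original bio when every copy is bad. A compact user-space sketch of that mirror search, with a hypothetical read/verify callback standing in for the btrfs I/O path:

```c
#include <stdio.h>

/* hypothetical callback: returns 0 when the copy read from 'mirror' verifies */
typedef int (*read_mirror_fn)(int mirror, void *ctx);

/*
 * Try every mirror except the one that already failed; stop at the first
 * copy that reads back good data.  Returns the good mirror number, or -1
 * when all copies are bad -- the point at which the original bio's end_io
 * would be completed with an error.
 */
static int find_good_mirror(int failed_mirror, int num_copies,
			    read_mirror_fn read_mirror, void *ctx)
{
	for (int mirror = 1; mirror <= num_copies; mirror++) {
		if (mirror == failed_mirror)
			continue;
		if (read_mirror(mirror, ctx) == 0)
			return mirror;
	}
	return -1;
}

static int fake_read(int mirror, void *ctx)
{
	(void)ctx;
	return mirror == 2 ? 0 : -1;	/* pretend only mirror 2 is intact */
}

int main(void)
{
	int good = find_good_mirror(1, 2, fake_read, NULL);
	printf("good mirror: %d\n", good);	/* 2 */
	return 0;
}
```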
1810static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1811 int did_repair)
1812{
1813 int ret;
1814 int err = 0;
1815 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1816
1817 set_state_private(failure_tree, rec->start, 0);
1818 ret = clear_extent_bits(failure_tree, rec->start,
1819 rec->start + rec->len - 1,
1820 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1821 if (ret)
1822 err = ret;
1823
1824 if (did_repair) {
1825 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1826 rec->start + rec->len - 1,
1827 EXTENT_DAMAGED, GFP_NOFS);
1828 if (ret && !err)
1829 err = ret;
1830 }
1831
1832 kfree(rec);
1833 return err;
1834}
1835
1836static void repair_io_failure_callback(struct bio *bio, int err)
1837{
1838 complete(bio->bi_private);
1839}
1840
1841/*
1842 * this bypasses the standard btrfs submit functions deliberately, as
1843 * the standard behavior is to write all copies in a raid setup. here we only
1844 * want to write the one bad copy. so we do the mapping for ourselves and issue
1845 * submit_bio directly.
1846 * to avoid any synchonization issues, wait for the data after writing, which
1847 * actually prevents the read that triggered the error from finishing.
1848 * currently, there can be no more than two copies of every data bit. thus,
1849 * exactly one rewrite is required.
1850 */
1851int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1852 u64 length, u64 logical, struct page *page,
1853 int mirror_num)
1854{
1855 struct bio *bio;
1856 struct btrfs_device *dev;
1857 DECLARE_COMPLETION_ONSTACK(compl);
1858 u64 map_length = 0;
1859 u64 sector;
1860 struct btrfs_bio *bbio = NULL;
1861 int ret;
1862
1863 BUG_ON(!mirror_num);
1864
1865 bio = bio_alloc(GFP_NOFS, 1);
1866 if (!bio)
1867 return -EIO;
1868 bio->bi_private = &compl;
1869 bio->bi_end_io = repair_io_failure_callback;
1870 bio->bi_size = 0;
1871 map_length = length;
1872
1873 ret = btrfs_map_block(map_tree, WRITE, logical,
1874 &map_length, &bbio, mirror_num);
1875 if (ret) {
1876 bio_put(bio);
1877 return -EIO;
1878 }
1879 BUG_ON(mirror_num != bbio->mirror_num);
1880 sector = bbio->stripes[mirror_num-1].physical >> 9;
1881 bio->bi_sector = sector;
1882 dev = bbio->stripes[mirror_num-1].dev;
1883 kfree(bbio);
1884 if (!dev || !dev->bdev || !dev->writeable) {
1885 bio_put(bio);
1886 return -EIO;
1887 }
1888 bio->bi_bdev = dev->bdev;
1889 bio_add_page(bio, page, length, start-page_offset(page));
1890 submit_bio(WRITE_SYNC, bio);
1891 wait_for_completion(&compl);
1892
1893 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1894 /* try to remap that extent elsewhere? */
1895 bio_put(bio);
1896 return -EIO;
1897 }
1898
1899 printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1900 "sector %llu)\n", page->mapping->host->i_ino, start,
1901 dev->name, sector);
1902
1903 bio_put(bio);
1904 return 0;
1905}
1906
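One small detail in repair_io_failure() above: bio->bi_sector is counted in 512-byte sectors, so the byte-granular physical offset taken from the stripe is shifted right by 9 before being handed to the bio. A trivial standalone illustration of that conversion:

```c
#include <stdio.h>

/* byte offset on the device -> 512-byte sector number (2^9 == 512) */
static unsigned long long bytes_to_sector(unsigned long long physical)
{
	return physical >> 9;
}

int main(void)
{
	printf("%llu\n", bytes_to_sector(1048576ULL));	/* 1 MiB -> sector 2048 */
	return 0;
}
```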
1907/*
1908 * each time an IO finishes, we do a fast check in the IO failure tree
1909 * to see if we need to process or clean up an io_failure_record
1910 */
1911static int clean_io_failure(u64 start, struct page *page)
1912{
1913 u64 private;
1914 u64 private_failure;
1915 struct io_failure_record *failrec;
1916 struct btrfs_mapping_tree *map_tree;
1917 struct extent_state *state;
1918 int num_copies;
1919 int did_repair = 0;
1920 int ret;
1921 struct inode *inode = page->mapping->host;
1922
1923 private = 0;
1924 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1925 (u64)-1, 1, EXTENT_DIRTY, 0);
1926 if (!ret)
1927 return 0;
1928
1929 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1930 &private_failure);
1931 if (ret)
1932 return 0;
1933
1934 failrec = (struct io_failure_record *)(unsigned long) private_failure;
1935 BUG_ON(!failrec->this_mirror);
1936
1937 if (failrec->in_validation) {
1938 /* there was no real error, just free the record */
1939 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1940 failrec->start);
1941 did_repair = 1;
1942 goto out;
1943 }
1944
1945 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1946 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1947 failrec->start,
1948 EXTENT_LOCKED);
1949 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1950
1951 if (state && state->start == failrec->start) {
1952 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1953 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1954 failrec->len);
1955 if (num_copies > 1) {
1956 ret = repair_io_failure(map_tree, start, failrec->len,
1957 failrec->logical, page,
1958 failrec->failed_mirror);
1959 did_repair = !ret;
1960 }
1961 }
1962
1963out:
1964 if (!ret)
1965 ret = free_io_failure(inode, failrec, did_repair);
1966
1967 return ret;
1968}
1969
1970/*
1971 * this is a generic handler for readpage errors (default
1972 * readpage_io_failed_hook). if other copies exist, read those and write back
1973 * good data to the failed position. does not investigate in remapping the
1974 * failed extent elsewhere, hoping the device will be smart enough to do this as
1975 * needed
1976 */
1977
1978static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1979 u64 start, u64 end, int failed_mirror,
1980 struct extent_state *state)
1981{
1982 struct io_failure_record *failrec = NULL;
1983 u64 private;
1984 struct extent_map *em;
1985 struct inode *inode = page->mapping->host;
1986 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1987 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1988 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1989 struct bio *bio;
1990 int num_copies;
1991 int ret;
1992 int read_mode;
1993 u64 logical;
1994
1995 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
1996
1997 ret = get_state_private(failure_tree, start, &private);
1998 if (ret) {
1999 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2000 if (!failrec)
2001 return -ENOMEM;
2002 failrec->start = start;
2003 failrec->len = end - start + 1;
2004 failrec->this_mirror = 0;
2005 failrec->bio_flags = 0;
2006 failrec->in_validation = 0;
2007
2008 read_lock(&em_tree->lock);
2009 em = lookup_extent_mapping(em_tree, start, failrec->len);
2010 if (!em) {
2011 read_unlock(&em_tree->lock);
2012 kfree(failrec);
2013 return -EIO;
2014 }
2015
2016 if (em->start > start || em->start + em->len < start) {
2017 free_extent_map(em);
2018 em = NULL;
2019 }
2020 read_unlock(&em_tree->lock);
2021
2022 if (!em || IS_ERR(em)) {
2023 kfree(failrec);
2024 return -EIO;
2025 }
2026 logical = start - em->start;
2027 logical = em->block_start + logical;
2028 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2029 logical = em->block_start;
2030 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2031 extent_set_compress_type(&failrec->bio_flags,
2032 em->compress_type);
2033 }
2034 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2035 "len=%llu\n", logical, start, failrec->len);
2036 failrec->logical = logical;
2037 free_extent_map(em);
2038
2039 /* set the bits in the private failure tree */
2040 ret = set_extent_bits(failure_tree, start, end,
2041 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2042 if (ret >= 0)
2043 ret = set_state_private(failure_tree, start,
2044 (u64)(unsigned long)failrec);
2045 /* set the bits in the inode's tree */
2046 if (ret >= 0)
2047 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2048 GFP_NOFS);
2049 if (ret < 0) {
2050 kfree(failrec);
2051 return ret;
2052 }
2053 } else {
2054 failrec = (struct io_failure_record *)(unsigned long)private;
2055 pr_debug("bio_readpage_error: (found) logical=%llu, "
2056 "start=%llu, len=%llu, validation=%d\n",
2057 failrec->logical, failrec->start, failrec->len,
2058 failrec->in_validation);
2059 /*
2060 * when data can be on disk more than twice, add to failrec here
2061 * (e.g. with a list for failed_mirror) to make
2062 * clean_io_failure() clean all those errors at once.
2063 */
2064 }
2065 num_copies = btrfs_num_copies(
2066 &BTRFS_I(inode)->root->fs_info->mapping_tree,
2067 failrec->logical, failrec->len);
2068 if (num_copies == 1) {
2069 /*
2070 * we only have a single copy of the data, so don't bother with
2071 * all the retry and error correction code that follows. no
2072 * matter what the error is, it is very likely to persist.
2073 */
2074 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2075 "state=%p, num_copies=%d, next_mirror %d, "
2076 "failed_mirror %d\n", state, num_copies,
2077 failrec->this_mirror, failed_mirror);
2078 free_io_failure(inode, failrec, 0);
2079 return -EIO;
2080 }
2081
2082 if (!state) {
2083 spin_lock(&tree->lock);
2084 state = find_first_extent_bit_state(tree, failrec->start,
2085 EXTENT_LOCKED);
2086 if (state && state->start != failrec->start)
2087 state = NULL;
2088 spin_unlock(&tree->lock);
2089 }
2090
2091 /*
2092 * there are two premises:
2093 * a) deliver good data to the caller
2094 * b) correct the bad sectors on disk
2095 */
2096 if (failed_bio->bi_vcnt > 1) {
2097 /*
2098 * to fulfill b), we need to know the exact failing sectors, as
2099 * we don't want to rewrite any more than the failed ones. thus,
2100 * we need separate read requests for the failed bio
2101 *
2102 * if the following BUG_ON triggers, our validation request got
2103 * merged. we need separate requests for our algorithm to work.
2104 */
2105 BUG_ON(failrec->in_validation);
2106 failrec->in_validation = 1;
2107 failrec->this_mirror = failed_mirror;
2108 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2109 } else {
2110 /*
2111 * we're ready to fulfill a) and b) alongside. get a good copy
2112 * of the failed sector and if we succeed, we have setup
2113 * everything for repair_io_failure to do the rest for us.
2114 */
2115 if (failrec->in_validation) {
2116 BUG_ON(failrec->this_mirror != failed_mirror);
2117 failrec->in_validation = 0;
2118 failrec->this_mirror = 0;
2119 }
2120 failrec->failed_mirror = failed_mirror;
2121 failrec->this_mirror++;
2122 if (failrec->this_mirror == failed_mirror)
2123 failrec->this_mirror++;
2124 read_mode = READ_SYNC;
2125 }
2126
2127 if (!state || failrec->this_mirror > num_copies) {
2128 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2129 "next_mirror %d, failed_mirror %d\n", state,
2130 num_copies, failrec->this_mirror, failed_mirror);
2131 free_io_failure(inode, failrec, 0);
2132 return -EIO;
2133 }
2134
2135 bio = bio_alloc(GFP_NOFS, 1);
2136 bio->bi_private = state;
2137 bio->bi_end_io = failed_bio->bi_end_io;
2138 bio->bi_sector = failrec->logical >> 9;
2139 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2140 bio->bi_size = 0;
2141
2142 bio_add_page(bio, page, failrec->len, start - page_offset(page));
2143
2144 pr_debug("bio_readpage_error: submitting new read[%#x] to "
2145 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2146 failrec->this_mirror, num_copies, failrec->in_validation);
2147
2148 tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
2149 failrec->bio_flags, 0);
2150 return 0;
2151}
2152
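bio_readpage_error() above retries one mirror at a time rather than scanning all copies in one go: it bumps this_mirror, steps over the mirror that failed originally, and gives up once the count exceeds num_copies. A user-space restatement of just that stepping rule (toy helper, not kernel code):

```c
#include <stdio.h>

/*
 * Pick the mirror for the next retry: advance by one, step over the mirror
 * that produced the original failure, and return -1 once every copy has
 * been tried (the this_mirror > num_copies check in the kernel).
 */
static int next_mirror(int this_mirror, int failed_mirror, int num_copies)
{
	this_mirror++;
	if (this_mirror == failed_mirror)
		this_mirror++;
	return this_mirror > num_copies ? -1 : this_mirror;
}

int main(void)
{
	int m = 0;				/* failrec->this_mirror starts at 0 */

	while ((m = next_mirror(m, 2, 3)) != -1)
		printf("retry with mirror %d\n", m);	/* prints 1, then 3 */
	return 0;
}
```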
1647/* lots and lots of room for performance fixes in the end_bio funcs */ 2153/* lots and lots of room for performance fixes in the end_bio funcs */
1648 2154
1649/* 2155/*
@@ -1742,6 +2248,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1742 struct extent_state *cached = NULL; 2248 struct extent_state *cached = NULL;
1743 struct extent_state *state; 2249 struct extent_state *state;
1744 2250
2251 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2252 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2253 (long int)bio->bi_bdev);
1745 tree = &BTRFS_I(page->mapping->host)->io_tree; 2254 tree = &BTRFS_I(page->mapping->host)->io_tree;
1746 2255
1747 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2256 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1772,12 +2281,26 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1772 state); 2281 state);
1773 if (ret) 2282 if (ret)
1774 uptodate = 0; 2283 uptodate = 0;
2284 else
2285 clean_io_failure(start, page);
1775 } 2286 }
1776 if (!uptodate && tree->ops && 2287 if (!uptodate) {
1777 tree->ops->readpage_io_failed_hook) { 2288 int failed_mirror;
1778 ret = tree->ops->readpage_io_failed_hook(bio, page, 2289 failed_mirror = (int)(unsigned long)bio->bi_bdev;
1779 start, end, NULL); 2290 /*
2291 * The generic bio_readpage_error handles errors the
2292 * following way: If possible, new read requests are
2293 * created and submitted and will end up in
2294 * end_bio_extent_readpage as well (if we're lucky, not
2295 * in the !uptodate case). In that case it returns 0 and
2296 * we just go on with the next page in our bio. If it
2297 * can't handle the error it will return -EIO and we
2298 * remain responsible for that page.
2299 */
2300 ret = bio_readpage_error(bio, page, start, end,
2301 failed_mirror, NULL);
1780 if (ret == 0) { 2302 if (ret == 0) {
2303error_handled:
1781 uptodate = 2304 uptodate =
1782 test_bit(BIO_UPTODATE, &bio->bi_flags); 2305 test_bit(BIO_UPTODATE, &bio->bi_flags);
1783 if (err) 2306 if (err)
@@ -1785,6 +2308,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1785 uncache_state(&cached); 2308 uncache_state(&cached);
1786 continue; 2309 continue;
1787 } 2310 }
2311 if (tree->ops && tree->ops->readpage_io_failed_hook) {
2312 ret = tree->ops->readpage_io_failed_hook(
2313 bio, page, start, end,
2314 failed_mirror, state);
2315 if (ret == 0)
2316 goto error_handled;
2317 }
1788 } 2318 }
1789 2319
1790 if (uptodate) { 2320 if (uptodate) {
@@ -1856,6 +2386,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1856 mirror_num, bio_flags, start); 2386 mirror_num, bio_flags, start);
1857 else 2387 else
1858 submit_bio(rw, bio); 2388 submit_bio(rw, bio);
2389
1859 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2390 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1860 ret = -EOPNOTSUPP; 2391 ret = -EOPNOTSUPP;
1861 bio_put(bio); 2392 bio_put(bio);
@@ -2121,16 +2652,16 @@ out:
2121} 2652}
2122 2653
2123int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2654int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2124 get_extent_t *get_extent) 2655 get_extent_t *get_extent, int mirror_num)
2125{ 2656{
2126 struct bio *bio = NULL; 2657 struct bio *bio = NULL;
2127 unsigned long bio_flags = 0; 2658 unsigned long bio_flags = 0;
2128 int ret; 2659 int ret;
2129 2660
2130 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2661 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2131 &bio_flags); 2662 &bio_flags);
2132 if (bio) 2663 if (bio)
2133 ret = submit_one_bio(READ, bio, 0, bio_flags); 2664 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2134 return ret; 2665 return ret;
2135} 2666}
2136 2667
@@ -2181,6 +2712,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2181 int compressed; 2712 int compressed;
2182 int write_flags; 2713 int write_flags;
2183 unsigned long nr_written = 0; 2714 unsigned long nr_written = 0;
2715 bool fill_delalloc = true;
2184 2716
2185 if (wbc->sync_mode == WB_SYNC_ALL) 2717 if (wbc->sync_mode == WB_SYNC_ALL)
2186 write_flags = WRITE_SYNC; 2718 write_flags = WRITE_SYNC;
@@ -2190,6 +2722,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2190 trace___extent_writepage(page, inode, wbc); 2722 trace___extent_writepage(page, inode, wbc);
2191 2723
2192 WARN_ON(!PageLocked(page)); 2724 WARN_ON(!PageLocked(page));
2725
2726 ClearPageError(page);
2727
2193 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2728 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2194 if (page->index > end_index || 2729 if (page->index > end_index ||
2195 (page->index == end_index && !pg_offset)) { 2730 (page->index == end_index && !pg_offset)) {
@@ -2211,10 +2746,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2211 2746
2212 set_page_extent_mapped(page); 2747 set_page_extent_mapped(page);
2213 2748
2749 if (!tree->ops || !tree->ops->fill_delalloc)
2750 fill_delalloc = false;
2751
2214 delalloc_start = start; 2752 delalloc_start = start;
2215 delalloc_end = 0; 2753 delalloc_end = 0;
2216 page_started = 0; 2754 page_started = 0;
2217 if (!epd->extent_locked) { 2755 if (!epd->extent_locked && fill_delalloc) {
2218 u64 delalloc_to_write = 0; 2756 u64 delalloc_to_write = 0;
2219 /* 2757 /*
2220 * make sure the wbc mapping index is at least updated 2758 * make sure the wbc mapping index is at least updated
@@ -2432,6 +2970,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2432 pgoff_t index; 2970 pgoff_t index;
2433 pgoff_t end; /* Inclusive */ 2971 pgoff_t end; /* Inclusive */
2434 int scanned = 0; 2972 int scanned = 0;
2973 int tag;
2435 2974
2436 pagevec_init(&pvec, 0); 2975 pagevec_init(&pvec, 0);
2437 if (wbc->range_cyclic) { 2976 if (wbc->range_cyclic) {
@@ -2442,11 +2981,16 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2442 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2981 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2443 scanned = 1; 2982 scanned = 1;
2444 } 2983 }
2984 if (wbc->sync_mode == WB_SYNC_ALL)
2985 tag = PAGECACHE_TAG_TOWRITE;
2986 else
2987 tag = PAGECACHE_TAG_DIRTY;
2445retry: 2988retry:
2989 if (wbc->sync_mode == WB_SYNC_ALL)
2990 tag_pages_for_writeback(mapping, index, end);
2446 while (!done && !nr_to_write_done && (index <= end) && 2991 while (!done && !nr_to_write_done && (index <= end) &&
2447 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2992 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2448 PAGECACHE_TAG_DIRTY, min(end - index, 2993 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2449 (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2450 unsigned i; 2994 unsigned i;
2451 2995
2452 scanned = 1; 2996 scanned = 1;
@@ -2460,10 +3004,16 @@ retry:
2460 * swizzled back from swapper_space to tmpfs file 3004 * swizzled back from swapper_space to tmpfs file
2461 * mapping 3005 * mapping
2462 */ 3006 */
2463 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 3007 if (tree->ops &&
2464 tree->ops->write_cache_pages_lock_hook(page); 3008 tree->ops->write_cache_pages_lock_hook) {
2465 else 3009 tree->ops->write_cache_pages_lock_hook(page,
2466 lock_page(page); 3010 data, flush_fn);
3011 } else {
3012 if (!trylock_page(page)) {
3013 flush_fn(data);
3014 lock_page(page);
3015 }
3016 }
2467 3017
2468 if (unlikely(page->mapping != mapping)) { 3018 if (unlikely(page->mapping != mapping)) {
2469 unlock_page(page); 3019 unlock_page(page);
@@ -2541,7 +3091,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2541 struct writeback_control *wbc) 3091 struct writeback_control *wbc)
2542{ 3092{
2543 int ret; 3093 int ret;
2544 struct address_space *mapping = page->mapping;
2545 struct extent_page_data epd = { 3094 struct extent_page_data epd = {
2546 .bio = NULL, 3095 .bio = NULL,
2547 .tree = tree, 3096 .tree = tree,
@@ -2549,18 +3098,9 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2549 .extent_locked = 0, 3098 .extent_locked = 0,
2550 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3099 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2551 }; 3100 };
2552 struct writeback_control wbc_writepages = {
2553 .sync_mode = wbc->sync_mode,
2554 .older_than_this = NULL,
2555 .nr_to_write = 64,
2556 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2557 .range_end = (loff_t)-1,
2558 };
2559 3101
2560 ret = __extent_writepage(page, wbc, &epd); 3102 ret = __extent_writepage(page, wbc, &epd);
2561 3103
2562 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2563 __extent_writepage, &epd, flush_write_bio);
2564 flush_epd_write_bio(&epd); 3104 flush_epd_write_bio(&epd);
2565 return ret; 3105 return ret;
2566} 3106}
@@ -2584,7 +3124,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2584 }; 3124 };
2585 struct writeback_control wbc_writepages = { 3125 struct writeback_control wbc_writepages = {
2586 .sync_mode = mode, 3126 .sync_mode = mode,
2587 .older_than_this = NULL,
2588 .nr_to_write = nr_pages * 2, 3127 .nr_to_write = nr_pages * 2,
2589 .range_start = start, 3128 .range_start = start,
2590 .range_end = end + 1, 3129 .range_end = end + 1,
@@ -2840,6 +3379,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2840 return -ENOMEM; 3379 return -ENOMEM;
2841 path->leave_spinning = 1; 3380 path->leave_spinning = 1;
2842 3381
3382 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3383 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3384
2843 /* 3385 /*
2844 * lookup the last file extent. We're not using i_size here 3386 * lookup the last file extent. We're not using i_size here
2845 * because there might be preallocation past i_size 3387 * because there might be preallocation past i_size
@@ -2887,7 +3429,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2887 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3429 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2888 &cached_state, GFP_NOFS); 3430 &cached_state, GFP_NOFS);
2889 3431
2890 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3432 em = get_extent_skip_holes(inode, start, last_for_get_extent,
2891 get_extent); 3433 get_extent);
2892 if (!em) 3434 if (!em)
2893 goto out; 3435 goto out;
@@ -2976,7 +3518,7 @@ out:
2976 return ret; 3518 return ret;
2977} 3519}
2978 3520
2979static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3521inline struct page *extent_buffer_page(struct extent_buffer *eb,
2980 unsigned long i) 3522 unsigned long i)
2981{ 3523{
2982 struct page *p; 3524 struct page *p;
@@ -3001,7 +3543,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
3001 return p; 3543 return p;
3002} 3544}
3003 3545
3004static inline unsigned long num_extent_pages(u64 start, u64 len) 3546inline unsigned long num_extent_pages(u64 start, u64 len)
3005{ 3547{
3006 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3548 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3007 (start >> PAGE_CACHE_SHIFT); 3549 (start >> PAGE_CACHE_SHIFT);
@@ -3022,8 +3564,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3022 return NULL; 3564 return NULL;
3023 eb->start = start; 3565 eb->start = start;
3024 eb->len = len; 3566 eb->len = len;
3025 spin_lock_init(&eb->lock); 3567 rwlock_init(&eb->lock);
3026 init_waitqueue_head(&eb->lock_wq); 3568 atomic_set(&eb->write_locks, 0);
3569 atomic_set(&eb->read_locks, 0);
3570 atomic_set(&eb->blocking_readers, 0);
3571 atomic_set(&eb->blocking_writers, 0);
3572 atomic_set(&eb->spinning_readers, 0);
3573 atomic_set(&eb->spinning_writers, 0);
3574 init_waitqueue_head(&eb->write_lock_wq);
3575 init_waitqueue_head(&eb->read_lock_wq);
3027 3576
3028#if LEAK_DEBUG 3577#if LEAK_DEBUG
3029 spin_lock_irqsave(&leak_lock, flags); 3578 spin_lock_irqsave(&leak_lock, flags);
@@ -3119,7 +3668,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3119 i = 0; 3668 i = 0;
3120 } 3669 }
3121 for (; i < num_pages; i++, index++) { 3670 for (; i < num_pages; i++, index++) {
3122 p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM); 3671 p = find_or_create_page(mapping, index, GFP_NOFS);
3123 if (!p) { 3672 if (!p) {
3124 WARN_ON(1); 3673 WARN_ON(1);
3125 goto free_eb; 3674 goto free_eb;
@@ -3247,6 +3796,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3247 PAGECACHE_TAG_DIRTY); 3796 PAGECACHE_TAG_DIRTY);
3248 } 3797 }
3249 spin_unlock_irq(&page->mapping->tree_lock); 3798 spin_unlock_irq(&page->mapping->tree_lock);
3799 ClearPageError(page);
3250 unlock_page(page); 3800 unlock_page(page);
3251 } 3801 }
3252 return 0; 3802 return 0;
@@ -3266,6 +3816,22 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
3266 return was_dirty; 3816 return was_dirty;
3267} 3817}
3268 3818
3819static int __eb_straddles_pages(u64 start, u64 len)
3820{
3821 if (len < PAGE_CACHE_SIZE)
3822 return 1;
3823 if (start & (PAGE_CACHE_SIZE - 1))
3824 return 1;
3825 if ((start + len) & (PAGE_CACHE_SIZE - 1))
3826 return 1;
3827 return 0;
3828}
3829
3830static int eb_straddles_pages(struct extent_buffer *eb)
3831{
3832 return __eb_straddles_pages(eb->start, eb->len);
3833}
3834
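__eb_straddles_pages() above answers whether the buffer shares a page with other data (sub-page length, or a start/end that is not page aligned); only then do the following hunks keep consulting the per-range EXTENT_UPTODATE bits rather than relying on page flags alone. A quick standalone check, assuming a 4 KiB page size for the sake of the example:

```c
#include <stdio.h>

#define PAGE_SZ 4096ULL		/* assumed page size for this example */

/* 1 if [start, start+len) is not a whole number of page-aligned pages */
static int straddles_pages(unsigned long long start, unsigned long long len)
{
	if (len < PAGE_SZ)
		return 1;
	if (start & (PAGE_SZ - 1))
		return 1;
	if ((start + len) & (PAGE_SZ - 1))
		return 1;
	return 0;
}

int main(void)
{
	printf("%d\n", straddles_pages(16384, 4096));	/* 0: one aligned page */
	printf("%d\n", straddles_pages(16384, 2048));	/* 1: sub-page buffer */
	return 0;
}
```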
3269int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3835int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3270 struct extent_buffer *eb, 3836 struct extent_buffer *eb,
3271 struct extent_state **cached_state) 3837 struct extent_state **cached_state)
@@ -3277,8 +3843,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3277 num_pages = num_extent_pages(eb->start, eb->len); 3843 num_pages = num_extent_pages(eb->start, eb->len);
3278 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3844 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3279 3845
3280 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3846 if (eb_straddles_pages(eb)) {
3281 cached_state, GFP_NOFS); 3847 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3848 cached_state, GFP_NOFS);
3849 }
3282 for (i = 0; i < num_pages; i++) { 3850 for (i = 0; i < num_pages; i++) {
3283 page = extent_buffer_page(eb, i); 3851 page = extent_buffer_page(eb, i);
3284 if (page) 3852 if (page)
@@ -3296,8 +3864,10 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3296 3864
3297 num_pages = num_extent_pages(eb->start, eb->len); 3865 num_pages = num_extent_pages(eb->start, eb->len);
3298 3866
3299 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3867 if (eb_straddles_pages(eb)) {
3300 NULL, GFP_NOFS); 3868 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3869 NULL, GFP_NOFS);
3870 }
3301 for (i = 0; i < num_pages; i++) { 3871 for (i = 0; i < num_pages; i++) {
3302 page = extent_buffer_page(eb, i); 3872 page = extent_buffer_page(eb, i);
3303 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3873 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3320,9 +3890,12 @@ int extent_range_uptodate(struct extent_io_tree *tree,
3320 int uptodate; 3890 int uptodate;
3321 unsigned long index; 3891 unsigned long index;
3322 3892
3323 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); 3893 if (__eb_straddles_pages(start, end - start + 1)) {
3324 if (ret) 3894 ret = test_range_bit(tree, start, end,
3325 return 1; 3895 EXTENT_UPTODATE, 1, NULL);
3896 if (ret)
3897 return 1;
3898 }
3326 while (start <= end) { 3899 while (start <= end) {
3327 index = start >> PAGE_CACHE_SHIFT; 3900 index = start >> PAGE_CACHE_SHIFT;
3328 page = find_get_page(tree->mapping, index); 3901 page = find_get_page(tree->mapping, index);
@@ -3350,10 +3923,12 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3350 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3923 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3351 return 1; 3924 return 1;
3352 3925
3353 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3926 if (eb_straddles_pages(eb)) {
3354 EXTENT_UPTODATE, 1, cached_state); 3927 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3355 if (ret) 3928 EXTENT_UPTODATE, 1, cached_state);
3356 return ret; 3929 if (ret)
3930 return ret;
3931 }
3357 3932
3358 num_pages = num_extent_pages(eb->start, eb->len); 3933 num_pages = num_extent_pages(eb->start, eb->len);
3359 for (i = 0; i < num_pages; i++) { 3934 for (i = 0; i < num_pages; i++) {
@@ -3367,8 +3942,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3367} 3942}
3368 3943
3369int read_extent_buffer_pages(struct extent_io_tree *tree, 3944int read_extent_buffer_pages(struct extent_io_tree *tree,
3370 struct extent_buffer *eb, 3945 struct extent_buffer *eb, u64 start, int wait,
3371 u64 start, int wait,
3372 get_extent_t *get_extent, int mirror_num) 3946 get_extent_t *get_extent, int mirror_num)
3373{ 3947{
3374 unsigned long i; 3948 unsigned long i;
@@ -3386,9 +3960,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3386 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3960 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3387 return 0; 3961 return 0;
3388 3962
3389 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3963 if (eb_straddles_pages(eb)) {
3390 EXTENT_UPTODATE, 1, NULL)) { 3964 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3391 return 0; 3965 EXTENT_UPTODATE, 1, NULL)) {
3966 return 0;
3967 }
3392 } 3968 }
3393 3969
3394 if (start) { 3970 if (start) {
@@ -3402,7 +3978,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3402 num_pages = num_extent_pages(eb->start, eb->len); 3978 num_pages = num_extent_pages(eb->start, eb->len);
3403 for (i = start_i; i < num_pages; i++) { 3979 for (i = start_i; i < num_pages; i++) {
3404 page = extent_buffer_page(eb, i); 3980 page = extent_buffer_page(eb, i);
3405 if (!wait) { 3981 if (wait == WAIT_NONE) {
3406 if (!trylock_page(page)) 3982 if (!trylock_page(page))
3407 goto unlock_exit; 3983 goto unlock_exit;
3408 } else { 3984 } else {
@@ -3446,7 +4022,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3446 if (bio) 4022 if (bio)
3447 submit_one_bio(READ, bio, mirror_num, bio_flags); 4023 submit_one_bio(READ, bio, mirror_num, bio_flags);
3448 4024
3449 if (ret || !wait) 4025 if (ret || wait != WAIT_COMPLETE)
3450 return ret; 4026 return ret;
3451 4027
3452 for (i = start_i; i < num_pages; i++) { 4028 for (i = start_i; i < num_pages; i++) {
@@ -3492,9 +4068,8 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3492 page = extent_buffer_page(eb, i); 4068 page = extent_buffer_page(eb, i);
3493 4069
3494 cur = min(len, (PAGE_CACHE_SIZE - offset)); 4070 cur = min(len, (PAGE_CACHE_SIZE - offset));
3495 kaddr = kmap_atomic(page, KM_USER1); 4071 kaddr = page_address(page);
3496 memcpy(dst, kaddr + offset, cur); 4072 memcpy(dst, kaddr + offset, cur);
3497 kunmap_atomic(kaddr, KM_USER1);
3498 4073
3499 dst += cur; 4074 dst += cur;
3500 len -= cur; 4075 len -= cur;
@@ -3504,9 +4079,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3504} 4079}
3505 4080
3506int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 4081int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3507 unsigned long min_len, char **token, char **map, 4082 unsigned long min_len, char **map,
3508 unsigned long *map_start, 4083 unsigned long *map_start,
3509 unsigned long *map_len, int km) 4084 unsigned long *map_len)
3510{ 4085{
3511 size_t offset = start & (PAGE_CACHE_SIZE - 1); 4086 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3512 char *kaddr; 4087 char *kaddr;
@@ -3536,42 +4111,12 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3536 } 4111 }
3537 4112
3538 p = extent_buffer_page(eb, i); 4113 p = extent_buffer_page(eb, i);
3539 kaddr = kmap_atomic(p, km); 4114 kaddr = page_address(p);
3540 *token = kaddr;
3541 *map = kaddr + offset; 4115 *map = kaddr + offset;
3542 *map_len = PAGE_CACHE_SIZE - offset; 4116 *map_len = PAGE_CACHE_SIZE - offset;
3543 return 0; 4117 return 0;
3544} 4118}
3545 4119
3546int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3547 unsigned long min_len,
3548 char **token, char **map,
3549 unsigned long *map_start,
3550 unsigned long *map_len, int km)
3551{
3552 int err;
3553 int save = 0;
3554 if (eb->map_token) {
3555 unmap_extent_buffer(eb, eb->map_token, km);
3556 eb->map_token = NULL;
3557 save = 1;
3558 }
3559 err = map_private_extent_buffer(eb, start, min_len, token, map,
3560 map_start, map_len, km);
3561 if (!err && save) {
3562 eb->map_token = *token;
3563 eb->kaddr = *map;
3564 eb->map_start = *map_start;
3565 eb->map_len = *map_len;
3566 }
3567 return err;
3568}
3569
3570void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3571{
3572 kunmap_atomic(token, km);
3573}
3574
3575int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 4120int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3576 unsigned long start, 4121 unsigned long start,
3577 unsigned long len) 4122 unsigned long len)
@@ -3595,9 +4140,8 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3595 4140
3596 cur = min(len, (PAGE_CACHE_SIZE - offset)); 4141 cur = min(len, (PAGE_CACHE_SIZE - offset));
3597 4142
3598 kaddr = kmap_atomic(page, KM_USER0); 4143 kaddr = page_address(page);
3599 ret = memcmp(ptr, kaddr + offset, cur); 4144 ret = memcmp(ptr, kaddr + offset, cur);
3600 kunmap_atomic(kaddr, KM_USER0);
3601 if (ret) 4145 if (ret)
3602 break; 4146 break;
3603 4147
@@ -3630,9 +4174,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3630 WARN_ON(!PageUptodate(page)); 4174 WARN_ON(!PageUptodate(page));
3631 4175
3632 cur = min(len, PAGE_CACHE_SIZE - offset); 4176 cur = min(len, PAGE_CACHE_SIZE - offset);
3633 kaddr = kmap_atomic(page, KM_USER1); 4177 kaddr = page_address(page);
3634 memcpy(kaddr + offset, src, cur); 4178 memcpy(kaddr + offset, src, cur);
3635 kunmap_atomic(kaddr, KM_USER1);
3636 4179
3637 src += cur; 4180 src += cur;
3638 len -= cur; 4181 len -= cur;
@@ -3661,9 +4204,8 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
3661 WARN_ON(!PageUptodate(page)); 4204 WARN_ON(!PageUptodate(page));
3662 4205
3663 cur = min(len, PAGE_CACHE_SIZE - offset); 4206 cur = min(len, PAGE_CACHE_SIZE - offset);
3664 kaddr = kmap_atomic(page, KM_USER0); 4207 kaddr = page_address(page);
3665 memset(kaddr + offset, c, cur); 4208 memset(kaddr + offset, c, cur);
3666 kunmap_atomic(kaddr, KM_USER0);
3667 4209
3668 len -= cur; 4210 len -= cur;
3669 offset = 0; 4211 offset = 0;
@@ -3694,9 +4236,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3694 4236
3695 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 4237 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3696 4238
3697 kaddr = kmap_atomic(page, KM_USER0); 4239 kaddr = page_address(page);
3698 read_extent_buffer(src, kaddr + offset, src_offset, cur); 4240 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3699 kunmap_atomic(kaddr, KM_USER0);
3700 4241
3701 src_offset += cur; 4242 src_offset += cur;
3702 len -= cur; 4243 len -= cur;
@@ -3709,20 +4250,17 @@ static void move_pages(struct page *dst_page, struct page *src_page,
3709 unsigned long dst_off, unsigned long src_off, 4250 unsigned long dst_off, unsigned long src_off,
3710 unsigned long len) 4251 unsigned long len)
3711{ 4252{
3712 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 4253 char *dst_kaddr = page_address(dst_page);
3713 if (dst_page == src_page) { 4254 if (dst_page == src_page) {
3714 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); 4255 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3715 } else { 4256 } else {
3716 char *src_kaddr = kmap_atomic(src_page, KM_USER1); 4257 char *src_kaddr = page_address(src_page);
3717 char *p = dst_kaddr + dst_off + len; 4258 char *p = dst_kaddr + dst_off + len;
3718 char *s = src_kaddr + src_off + len; 4259 char *s = src_kaddr + src_off + len;
3719 4260
3720 while (len--) 4261 while (len--)
3721 *--p = *--s; 4262 *--p = *--s;
3722
3723 kunmap_atomic(src_kaddr, KM_USER1);
3724 } 4263 }
3725 kunmap_atomic(dst_kaddr, KM_USER0);
3726} 4264}
3727 4265
3728static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 4266static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
@@ -3735,20 +4273,17 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
3735 unsigned long dst_off, unsigned long src_off, 4273 unsigned long dst_off, unsigned long src_off,
3736 unsigned long len) 4274 unsigned long len)
3737{ 4275{
3738 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 4276 char *dst_kaddr = page_address(dst_page);
3739 char *src_kaddr; 4277 char *src_kaddr;
3740 4278
3741 if (dst_page != src_page) { 4279 if (dst_page != src_page) {
3742 src_kaddr = kmap_atomic(src_page, KM_USER1); 4280 src_kaddr = page_address(src_page);
3743 } else { 4281 } else {
3744 src_kaddr = dst_kaddr; 4282 src_kaddr = dst_kaddr;
3745 BUG_ON(areas_overlap(src_off, dst_off, len)); 4283 BUG_ON(areas_overlap(src_off, dst_off, len));
3746 } 4284 }
3747 4285
3748 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 4286 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3749 kunmap_atomic(dst_kaddr, KM_USER0);
3750 if (dst_page != src_page)
3751 kunmap_atomic(src_kaddr, KM_USER1);
3752} 4287}
3753 4288
3754void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 4289void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,