author    Linus Torvalds <torvalds@linux-foundation.org>    2014-12-12 14:15:23 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2014-12-12 14:15:23 -0500
commit    bdeb03cada1c305346505c48e5b1dab37e9acc4e (patch)
tree      ecbfda926e8b5b621f37150d509f176886ac0d82 /fs/btrfs
parent    0349678ccd74d16c1f2bb58ecafec13ef7110e36 (diff)
parent    9627aeee3e203e30679549e4962633698a6bf87f (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs update from Chris Mason:
 "From a feature point of view, most of the code here comes from Miao
  Xie and others at Fujitsu to implement scrubbing and replacing devices
  on raid56. This has been in development for a while, and it's a big
  improvement.

  Filipe and Josef have a great assortment of fixes, many of which solve
  problems with corruptions either after a crash or in error conditions.
  I still have a round two from Filipe for next week that solves
  corruptions with discard and block group removal"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (62 commits)
  Btrfs: make get_caching_control unconditionally return the ctl
  Btrfs: fix unprotected deletion from pending_chunks list
  Btrfs: fix fs mapping extent map leak
  Btrfs: fix memory leak after block remove + trimming
  Btrfs: make btrfs_abort_transaction consider existence of new block groups
  Btrfs: fix race between writing free space cache and trimming
  Btrfs: fix race between fs trimming and block group remove/allocation
  Btrfs, replace: enable dev-replace for raid56
  Btrfs: fix freeing used extents after removing empty block group
  Btrfs: fix crash caused by block group removal
  Btrfs: fix invalid block group rbtree access after bg is removed
  Btrfs, raid56: fix use-after-free problem in the final device replace procedure on raid56
  Btrfs, replace: write raid56 parity into the replace target device
  Btrfs, replace: write dirty pages into the replace target device
  Btrfs, raid56: support parity scrub on raid56
  Btrfs, raid56: use a variant to record the operation type
  Btrfs, scrub: repair the common data on RAID5/6 if it is corrupted
  Btrfs, raid56: don't change bbio and raid_map
  Btrfs: remove unnecessary code of stripe_index assignment in __btrfs_map_block
  Btrfs: remove noused bbio_ret in __btrfs_map_block in condition
  ...
Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/check-integrity.c    163
-rw-r--r--  fs/btrfs/compression.c         18
-rw-r--r--  fs/btrfs/ctree.c                2
-rw-r--r--  fs/btrfs/ctree.h               85
-rw-r--r--  fs/btrfs/dev-replace.c         32
-rw-r--r--  fs/btrfs/dir-item.c            10
-rw-r--r--  fs/btrfs/disk-io.c             49
-rw-r--r--  fs/btrfs/extent-tree.c        211
-rw-r--r--  fs/btrfs/extent_io.c           41
-rw-r--r--  fs/btrfs/extent_io.h            1
-rw-r--r--  fs/btrfs/extent_map.c           2
-rw-r--r--  fs/btrfs/file.c                51
-rw-r--r--  fs/btrfs/free-space-cache.c   117
-rw-r--r--  fs/btrfs/free-space-cache.h     2
-rw-r--r--  fs/btrfs/inode-map.c            4
-rw-r--r--  fs/btrfs/inode.c              152
-rw-r--r--  fs/btrfs/ioctl.c               36
-rw-r--r--  fs/btrfs/ordered-data.c        49
-rw-r--r--  fs/btrfs/ordered-data.h        12
-rw-r--r--  fs/btrfs/raid56.c             763
-rw-r--r--  fs/btrfs/raid56.h              16
-rw-r--r--  fs/btrfs/scrub.c              893
-rw-r--r--  fs/btrfs/send.c                49
-rw-r--r--  fs/btrfs/super.c               94
-rw-r--r--  fs/btrfs/sysfs.c               34
-rw-r--r--  fs/btrfs/transaction.c        166
-rw-r--r--  fs/btrfs/transaction.h          6
-rw-r--r--  fs/btrfs/tree-log.c            50
-rw-r--r--  fs/btrfs/volumes.c             90
-rw-r--r--  fs/btrfs/volumes.h             32
-rw-r--r--  fs/btrfs/xattr.c              150
31 files changed, 2739 insertions, 641 deletions
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index cb7f3fe9c9f6..d897ef803b3b 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -94,6 +94,7 @@
 #include <linux/mutex.h>
 #include <linux/genhd.h>
 #include <linux/blkdev.h>
+#include <linux/vmalloc.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "hash.h"
@@ -326,9 +327,6 @@ static int btrfsic_handle_extent_data(struct btrfsic_state *state,
 static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
			      struct btrfsic_block_data_ctx *block_ctx_out,
			      int mirror_num);
-static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
-				  u32 len, struct block_device *bdev,
-				  struct btrfsic_block_data_ctx *block_ctx_out);
 static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
 static int btrfsic_read_block(struct btrfsic_state *state,
			      struct btrfsic_block_data_ctx *block_ctx);
@@ -1326,24 +1324,25 @@ static int btrfsic_create_link_to_next_block(
 		l = NULL;
 		next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
 	} else {
-		if (next_block->logical_bytenr != next_bytenr &&
-		    !(!next_block->is_metadata &&
-		      0 == next_block->logical_bytenr)) {
-			printk(KERN_INFO
-			       "Referenced block @%llu (%s/%llu/%d)"
-			       " found in hash table, %c,"
-			       " bytenr mismatch (!= stored %llu).\n",
-			       next_bytenr, next_block_ctx->dev->name,
-			       next_block_ctx->dev_bytenr, *mirror_nump,
-			       btrfsic_get_block_type(state, next_block),
-			       next_block->logical_bytenr);
-		} else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-			printk(KERN_INFO
-			       "Referenced block @%llu (%s/%llu/%d)"
-			       " found in hash table, %c.\n",
-			       next_bytenr, next_block_ctx->dev->name,
-			       next_block_ctx->dev_bytenr, *mirror_nump,
-			       btrfsic_get_block_type(state, next_block));
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+			if (next_block->logical_bytenr != next_bytenr &&
+			    !(!next_block->is_metadata &&
+			      0 == next_block->logical_bytenr))
+				printk(KERN_INFO
+				       "Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+				       next_bytenr, next_block_ctx->dev->name,
+				       next_block_ctx->dev_bytenr, *mirror_nump,
+				       btrfsic_get_block_type(state,
+							      next_block),
+				       next_block->logical_bytenr);
+			else
+				printk(KERN_INFO
+				       "Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+				       next_bytenr, next_block_ctx->dev->name,
+				       next_block_ctx->dev_bytenr, *mirror_nump,
+				       btrfsic_get_block_type(state,
+							      next_block));
+		}
 		next_block->logical_bytenr = next_bytenr;
 
 		next_block->mirror_num = *mirror_nump;
@@ -1529,7 +1528,9 @@ static int btrfsic_handle_extent_data(
 			return -1;
 		}
 		if (!block_was_created) {
-			if (next_block->logical_bytenr != next_bytenr &&
+			if ((state->print_mask &
+			     BTRFSIC_PRINT_MASK_VERBOSE) &&
+			    next_block->logical_bytenr != next_bytenr &&
 			    !(!next_block->is_metadata &&
 			      0 == next_block->logical_bytenr)) {
 				printk(KERN_INFO
@@ -1607,25 +1608,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 	return ret;
 }
 
-static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
-				  u32 len, struct block_device *bdev,
-				  struct btrfsic_block_data_ctx *block_ctx_out)
-{
-	block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
-	block_ctx_out->dev_bytenr = bytenr;
-	block_ctx_out->start = bytenr;
-	block_ctx_out->len = len;
-	block_ctx_out->datav = NULL;
-	block_ctx_out->pagev = NULL;
-	block_ctx_out->mem_to_free = NULL;
-	if (NULL != block_ctx_out->dev) {
-		return 0;
-	} else {
-		printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
-		return -ENXIO;
-	}
-}
-
 static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
 {
 	if (block_ctx->mem_to_free) {
@@ -1901,25 +1883,26 @@ again:
 							       dev_state,
 							       dev_bytenr);
 			}
-			if (block->logical_bytenr != bytenr &&
-			    !(!block->is_metadata &&
-			      block->logical_bytenr == 0))
-				printk(KERN_INFO
-				       "Written block @%llu (%s/%llu/%d)"
-				       " found in hash table, %c,"
-				       " bytenr mismatch"
-				       " (!= stored %llu).\n",
-				       bytenr, dev_state->name, dev_bytenr,
-				       block->mirror_num,
-				       btrfsic_get_block_type(state, block),
-				       block->logical_bytenr);
-			else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-				printk(KERN_INFO
-				       "Written block @%llu (%s/%llu/%d)"
-				       " found in hash table, %c.\n",
-				       bytenr, dev_state->name, dev_bytenr,
-				       block->mirror_num,
-				       btrfsic_get_block_type(state, block));
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+				if (block->logical_bytenr != bytenr &&
+				    !(!block->is_metadata &&
+				      block->logical_bytenr == 0))
+					printk(KERN_INFO
+					       "Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+					       bytenr, dev_state->name,
+					       dev_bytenr,
+					       block->mirror_num,
+					       btrfsic_get_block_type(state,
+								      block),
+					       block->logical_bytenr);
+				else
+					printk(KERN_INFO
+					       "Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+					       bytenr, dev_state->name,
+					       dev_bytenr, block->mirror_num,
+					       btrfsic_get_block_type(state,
+								      block));
+			}
 			block->logical_bytenr = bytenr;
 		} else {
 			if (num_pages * PAGE_CACHE_SIZE <
@@ -2002,24 +1985,13 @@ again:
 			}
 		}
 
-		if (block->is_superblock)
-			ret = btrfsic_map_superblock(state, bytenr,
-						     processed_len,
-						     bdev, &block_ctx);
-		else
-			ret = btrfsic_map_block(state, bytenr, processed_len,
-						&block_ctx, 0);
-		if (ret) {
-			printk(KERN_INFO
-			       "btrfsic: btrfsic_map_block(root @%llu)"
-			       " failed!\n", bytenr);
-			goto continue_loop;
-		}
-		block_ctx.datav = mapped_datav;
-		/* the following is required in case of writes to mirrors,
-		 * use the same that was used for the lookup */
 		block_ctx.dev = dev_state;
 		block_ctx.dev_bytenr = dev_bytenr;
+		block_ctx.start = bytenr;
+		block_ctx.len = processed_len;
+		block_ctx.pagev = NULL;
+		block_ctx.mem_to_free = NULL;
+		block_ctx.datav = mapped_datav;
 
 		if (is_metadata || state->include_extent_data) {
 			block->never_written = 0;
@@ -2133,10 +2105,6 @@ again:
 			/* this is getting ugly for the
 			 * include_extent_data case... */
 			bytenr = 0;	/* unknown */
-			block_ctx.start = bytenr;
-			block_ctx.len = processed_len;
-			block_ctx.mem_to_free = NULL;
-			block_ctx.pagev = NULL;
 		} else {
 			processed_len = state->metablock_size;
 			bytenr = btrfs_stack_header_bytenr(
@@ -2149,22 +2117,15 @@ again:
2149 "Written block @%llu (%s/%llu/?)" 2117 "Written block @%llu (%s/%llu/?)"
2150 " !found in hash table, M.\n", 2118 " !found in hash table, M.\n",
2151 bytenr, dev_state->name, dev_bytenr); 2119 bytenr, dev_state->name, dev_bytenr);
2152
2153 ret = btrfsic_map_block(state, bytenr, processed_len,
2154 &block_ctx, 0);
2155 if (ret) {
2156 printk(KERN_INFO
2157 "btrfsic: btrfsic_map_block(root @%llu)"
2158 " failed!\n",
2159 dev_bytenr);
2160 goto continue_loop;
2161 }
2162 } 2120 }
2163 block_ctx.datav = mapped_datav; 2121
2164 /* the following is required in case of writes to mirrors,
2165 * use the same that was used for the lookup */
2166 block_ctx.dev = dev_state; 2122 block_ctx.dev = dev_state;
2167 block_ctx.dev_bytenr = dev_bytenr; 2123 block_ctx.dev_bytenr = dev_bytenr;
2124 block_ctx.start = bytenr;
2125 block_ctx.len = processed_len;
2126 block_ctx.pagev = NULL;
2127 block_ctx.mem_to_free = NULL;
2128 block_ctx.datav = mapped_datav;
2168 2129
2169 block = btrfsic_block_alloc(); 2130 block = btrfsic_block_alloc();
2170 if (NULL == block) { 2131 if (NULL == block) {
@@ -3130,10 +3091,13 @@ int btrfsic_mount(struct btrfs_root *root,
 		       root->sectorsize, PAGE_CACHE_SIZE);
 		return -1;
 	}
-	state = kzalloc(sizeof(*state), GFP_NOFS);
-	if (NULL == state) {
-		printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
-		return -1;
+	state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+	if (!state) {
+		state = vzalloc(sizeof(*state));
+		if (!state) {
+			printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n");
+			return -1;
+		}
 	}
 
 	if (!btrfsic_is_initialized) {
@@ -3277,5 +3241,8 @@ void btrfsic_unmount(struct btrfs_root *root,
 
 	mutex_unlock(&btrfsic_mutex);
 
-	kfree(state);
+	if (is_vmalloc_addr(state))
+		vfree(state);
+	else
+		kfree(state);
 }
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index dcd9be32ac57..e9df8862012c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -224,16 +224,19 @@ out:
  * Clear the writeback bits on all of the file
  * pages for a compressed write
  */
-static noinline void end_compressed_writeback(struct inode *inode, u64 start,
-					      unsigned long ram_size)
+static noinline void end_compressed_writeback(struct inode *inode,
+					      const struct compressed_bio *cb)
 {
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+	unsigned long index = cb->start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT;
 	struct page *pages[16];
 	unsigned long nr_pages = end_index - index + 1;
 	int i;
 	int ret;
 
+	if (cb->errors)
+		mapping_set_error(inode->i_mapping, -EIO);
+
 	while (nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
 					    min_t(unsigned long,
@@ -244,6 +247,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start,
 			continue;
 		}
 		for (i = 0; i < ret; i++) {
+			if (cb->errors)
+				SetPageError(pages[i]);
 			end_page_writeback(pages[i]);
 			page_cache_release(pages[i]);
 		}
@@ -287,10 +292,11 @@ static void end_compressed_bio_write(struct bio *bio, int err)
 		tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
 						 cb->start,
 						 cb->start + cb->len - 1,
-						 NULL, 1);
+						 NULL,
+						 err ? 0 : 1);
 	cb->compressed_pages[0]->mapping = NULL;
 
-	end_compressed_writeback(inode, cb->start, cb->len);
+	end_compressed_writeback(inode, cb);
 	/* note, our inode could be gone now */
 
 	/*
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 150822ee0a0b..14a72ed14ef7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2929,7 +2929,7 @@ done:
 	 */
 	if (!p->leave_spinning)
 		btrfs_set_path_blocking(p);
-	if (ret < 0)
+	if (ret < 0 && !p->skip_release_on_error)
 		btrfs_release_path(p);
 	return ret;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fe69edda11fb..e6fbbd74b716 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -607,6 +607,7 @@ struct btrfs_path {
 	unsigned int leave_spinning:1;
 	unsigned int search_commit_root:1;
 	unsigned int need_commit_sem:1;
+	unsigned int skip_release_on_error:1;
 };
 
 /*
@@ -1170,6 +1171,7 @@ struct btrfs_space_info {
 	struct percpu_counter total_bytes_pinned;
 
 	struct list_head list;
+	struct list_head ro_bgs;
 
 	struct rw_semaphore groups_sem;
 	/* for block groups in our same type */
@@ -1276,6 +1278,8 @@ struct btrfs_block_group_cache {
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
+	unsigned int has_caching_ctl:1;
+	unsigned int removed:1;
 
 	int disk_cache_state;
 
@@ -1305,6 +1309,11 @@ struct btrfs_block_group_cache {
 
 	/* For delayed block group creation or deletion of empty block groups */
 	struct list_head bg_list;
+
+	/* For read-only block groups */
+	struct list_head ro_list;
+
+	atomic_t trimming;
 };
 
 /* delayed seq elem */
@@ -1402,6 +1411,11 @@ struct btrfs_fs_info {
 	 */
 	u64 last_trans_log_full_commit;
 	unsigned long mount_opt;
+	/*
+	 * Track requests for actions that need to be done during transaction
+	 * commit (like for some mount options).
+	 */
+	unsigned long pending_changes;
 	unsigned long compress_type:4;
 	int commit_interval;
 	/*
@@ -1729,6 +1743,12 @@ struct btrfs_fs_info {
 
 	/* For btrfs to record security options */
 	struct security_mnt_opts security_opts;
+
+	/*
+	 * Chunks that can't be freed yet (under a trim/discard operation)
+	 * and will be latter freed. Protected by fs_info->chunk_mutex.
+	 */
+	struct list_head pinned_chunks;
 };
 
 struct btrfs_subvolume_writers {
@@ -2093,7 +2113,6 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
 #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR	(1 << 22)
 #define BTRFS_MOUNT_RESCAN_UUID_TREE	(1 << 23)
-#define BTRFS_MOUNT_CHANGE_INODE_CACHE	(1 << 24)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 #define BTRFS_DEFAULT_MAX_INLINE	(8192)
@@ -2103,6 +2122,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define btrfs_raw_test_opt(o, opt)	((o) & BTRFS_MOUNT_##opt)
 #define btrfs_test_opt(root, opt)	((root)->fs_info->mount_opt & \
 					 BTRFS_MOUNT_##opt)
+
 #define btrfs_set_and_info(root, opt, fmt, args...)	\
 {							\
 	if (!btrfs_test_opt(root, opt))			\
@@ -2118,6 +2138,49 @@ struct btrfs_ioctl_defrag_range_args {
 }
 
 /*
+ * Requests for changes that need to be done during transaction commit.
+ *
+ * Internal mount options that are used for special handling of the real
+ * mount options (eg. cannot be set during remount and have to be set during
+ * transaction commit)
+ */
+
+#define BTRFS_PENDING_SET_INODE_MAP_CACHE	(0)
+#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE	(1)
+#define BTRFS_PENDING_COMMIT			(2)
+
+#define btrfs_test_pending(info, opt)	\
+	test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_set_pending(info, opt)	\
+	set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_clear_pending(info, opt)	\
+	clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+
+/*
+ * Helpers for setting pending mount option changes.
+ *
+ * Expects corresponding macros
+ * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name
+ */
+#define btrfs_set_pending_and_info(info, opt, fmt, args...) \
+do { \
+	if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \
+		btrfs_info((info), fmt, ##args); \
+		btrfs_set_pending((info), SET_##opt); \
+		btrfs_clear_pending((info), CLEAR_##opt); \
+	} \
+} while(0)
+
+#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \
+do { \
+	if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \
+		btrfs_info((info), fmt, ##args); \
+		btrfs_set_pending((info), CLEAR_##opt); \
+		btrfs_clear_pending((info), SET_##opt); \
+	} \
+} while(0)
+
+/*
  * Inode flags
  */
 #define BTRFS_INODE_NODATASUM		(1 << 0)
@@ -3351,7 +3414,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
 			   u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 group_start);
+			     struct btrfs_root *root, u64 group_start,
+			     struct extent_map *em);
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root);
@@ -3427,8 +3491,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info);
 int __get_raid_index(u64 flags);
-int btrfs_start_nocow_write(struct btrfs_root *root);
-void btrfs_end_nocow_write(struct btrfs_root *root);
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -3686,6 +3750,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 int verify_dir_item(struct btrfs_root *root,
 		    struct extent_buffer *leaf,
 		    struct btrfs_dir_item *dir_item);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+						 struct btrfs_path *path,
+						 const char *name,
+						 int name_len);
 
 /* orphan.c */
 int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -3857,6 +3925,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
 				     struct btrfs_trans_handle *trans, int mode,
 				     u64 start, u64 num_bytes, u64 min_size,
 				     loff_t actual_len, u64 *alloc_hint);
+int btrfs_inode_check_errors(struct inode *inode);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
@@ -3901,6 +3970,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 		      struct page **pages, size_t num_pages,
 		      loff_t pos, size_t write_bytes,
 		      struct extent_state **cached);
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -4097,7 +4167,12 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 /* dev-replace.c */
 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
 void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
-void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
+
+static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+	btrfs_bio_counter_sub(fs_info, 1);
+}
 
 /* reada.c */
 struct reada_control {
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 6f662b34ba0e..ca6a3a3b6b6c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -316,11 +316,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	struct btrfs_device *tgt_device = NULL;
 	struct btrfs_device *src_device = NULL;
 
-	if (btrfs_fs_incompat(fs_info, RAID56)) {
-		btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
-		return -EOPNOTSUPP;
-	}
-
 	switch (args->start.cont_reading_from_srcdev_mode) {
 	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
 	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
@@ -422,9 +417,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 			      &dev_replace->scrub_progress, 0, 1);
 
 	ret = btrfs_dev_replace_finishing(root->fs_info, ret);
-	WARN_ON(ret);
+	/* don't warn if EINPROGRESS, someone else might be running scrub */
+	if (ret == -EINPROGRESS) {
+		args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
+		ret = 0;
+	} else {
+		WARN_ON(ret);
+	}
 
-	return 0;
+	return ret;
 
 leave:
 	dev_replace->srcdev = NULL;
@@ -542,7 +543,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 
-		return 0;
+		return scrub_ret;
 	}
 
 	printk_in_rcu(KERN_INFO
@@ -571,15 +572,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
 	fs_info->fs_devices->rw_devices++;
 
-	/* replace the sysfs entry */
-	btrfs_kobj_rm_device(fs_info, src_device);
-	btrfs_kobj_add_device(fs_info, tgt_device);
-
 	btrfs_dev_replace_unlock(dev_replace);
 
 	btrfs_rm_dev_replace_blocked(fs_info);
 
-	btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+	btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
 
 	btrfs_rm_dev_replace_unblocked(fs_info);
 
@@ -594,6 +591,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 	mutex_unlock(&uuid_mutex);
 
+	/* replace the sysfs entry */
+	btrfs_kobj_rm_device(fs_info, src_device);
+	btrfs_kobj_add_device(fs_info, tgt_device);
+	btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
+
 	/* write back the superblocks */
 	trans = btrfs_start_transaction(root, 0);
 	if (!IS_ERR(trans))
@@ -920,9 +922,9 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
 	percpu_counter_inc(&fs_info->bio_counter);
 }
 
-void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
 {
-	percpu_counter_dec(&fs_info->bio_counter);
+	percpu_counter_sub(&fs_info->bio_counter, amount);
 
 	if (waitqueue_active(&fs_info->replace_wait))
 		wake_up(&fs_info->replace_wait);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index fc8df866e919..1752625fb4dd 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,10 +21,6 @@
 #include "hash.h"
 #include "transaction.h"
 
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
-			      struct btrfs_path *path,
-			      const char *name, int name_len);
-
 /*
  * insert a name into a directory, doing overflow properly if there is a hash
  * collision. data_size indicates how big the item inserted should be. On
@@ -383,9 +379,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
  * this walks through all the entries in a dir item and finds one
  * for a specific name.
  */
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
-			      struct btrfs_path *path,
-			      const char *name, int name_len)
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+						 struct btrfs_path *path,
+						 const char *name, int name_len)
 {
 	struct btrfs_dir_item *dir_item;
 	unsigned long name_ptr;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1bf9f897065d..30965120772b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2384,6 +2384,8 @@ int open_ctree(struct super_block *sb,
 	init_waitqueue_head(&fs_info->transaction_blocked_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
 
+	INIT_LIST_HEAD(&fs_info->pinned_chunks);
+
 	ret = btrfs_alloc_stripe_hash_table(fs_info);
 	if (ret) {
 		err = ret;
@@ -2830,9 +2832,11 @@ retry_root_backup:
 		btrfs_set_opt(fs_info->mount_opt, SSD);
 	}
 
-	/* Set the real inode map cache flag */
-	if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE))
-		btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE);
+	/*
+	 * Mount does not set all options immediatelly, we can do it now and do
+	 * not have to wait for transaction commit
+	 */
+	btrfs_apply_pending_changes(fs_info);
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
@@ -3713,6 +3717,17 @@ void close_ctree(struct btrfs_root *root)
 
 	btrfs_free_block_rsv(root, root->orphan_block_rsv);
 	root->orphan_block_rsv = NULL;
+
+	lock_chunks(root);
+	while (!list_empty(&fs_info->pinned_chunks)) {
+		struct extent_map *em;
+
+		em = list_first_entry(&fs_info->pinned_chunks,
+				      struct extent_map, list);
+		list_del_init(&em->list);
+		free_extent_map(em);
+	}
+	unlock_chunks(root);
 }
 
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -3839,12 +3854,12 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	 */
 	if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
 		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
-				sb->root);
+				btrfs_super_root(sb));
 	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
-		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
-				sb->chunk_root);
+		printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
+				btrfs_super_chunk_root(sb));
 	if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
-		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+		printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
 				btrfs_super_log_root(sb));
 
 	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
@@ -4129,6 +4144,25 @@ again:
 	return 0;
 }
 
+static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
+				       struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	spin_lock(&fs_info->trans_lock);
+	while (!list_empty(&cur_trans->pending_ordered)) {
+		ordered = list_first_entry(&cur_trans->pending_ordered,
+					   struct btrfs_ordered_extent,
+					   trans_list);
+		list_del_init(&ordered->trans_list);
+		spin_unlock(&fs_info->trans_lock);
+
+		btrfs_put_ordered_extent(ordered);
+		spin_lock(&fs_info->trans_lock);
+	}
+	spin_unlock(&fs_info->trans_lock);
+}
+
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 				   struct btrfs_root *root)
 {
@@ -4140,6 +4174,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 	cur_trans->state = TRANS_STATE_UNBLOCKED;
 	wake_up(&root->fs_info->transaction_wait);
 
+	btrfs_free_pending_ordered(cur_trans, root->fs_info);
 	btrfs_destroy_delayed_inodes(root);
 	btrfs_assert_delayed_root_empty(root);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 47c1ba141082..222d6aea4a8a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -315,12 +315,6 @@ get_caching_control(struct btrfs_block_group_cache *cache)
 	struct btrfs_caching_control *ctl;
 
 	spin_lock(&cache->lock);
-	if (cache->cached != BTRFS_CACHE_STARTED) {
-		spin_unlock(&cache->lock);
-		return NULL;
-	}
-
-	/* We're loading it the fast way, so we don't have a caching_ctl. */
 	if (!cache->caching_ctl) {
 		spin_unlock(&cache->lock);
 		return NULL;
@@ -594,6 +588,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	spin_unlock(&cache->lock);
 
 	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
+		mutex_lock(&caching_ctl->mutex);
 		ret = load_free_space_cache(fs_info, cache);
 
 		spin_lock(&cache->lock);
@@ -601,15 +596,19 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 			cache->caching_ctl = NULL;
 			cache->cached = BTRFS_CACHE_FINISHED;
 			cache->last_byte_to_unpin = (u64)-1;
+			caching_ctl->progress = (u64)-1;
 		} else {
 			if (load_cache_only) {
 				cache->caching_ctl = NULL;
 				cache->cached = BTRFS_CACHE_NO;
 			} else {
 				cache->cached = BTRFS_CACHE_STARTED;
+				cache->has_caching_ctl = 1;
 			}
 		}
 		spin_unlock(&cache->lock);
+		mutex_unlock(&caching_ctl->mutex);
+
 		wake_up(&caching_ctl->wait);
 		if (ret == 1) {
 			put_caching_control(caching_ctl);
@@ -627,6 +626,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 		cache->cached = BTRFS_CACHE_NO;
 	} else {
 		cache->cached = BTRFS_CACHE_STARTED;
+		cache->has_caching_ctl = 1;
 	}
 	spin_unlock(&cache->lock);
 	wake_up(&caching_ctl->wait);
@@ -3162,7 +3162,19 @@ next_block_group(struct btrfs_root *root,
 		 struct btrfs_block_group_cache *cache)
 {
 	struct rb_node *node;
+
 	spin_lock(&root->fs_info->block_group_cache_lock);
+
+	/* If our block group was removed, we need a full search. */
+	if (RB_EMPTY_NODE(&cache->cache_node)) {
+		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
+
+		spin_unlock(&root->fs_info->block_group_cache_lock);
+		btrfs_put_block_group(cache);
+		cache = btrfs_lookup_first_block_group(root->fs_info,
+						       next_bytenr);
+		return cache;
+	}
 	node = rb_next(&cache->cache_node);
 	btrfs_put_block_group(cache);
 	if (node) {
@@ -3504,6 +3516,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->chunk_alloc = 0;
 	found->flush = 0;
 	init_waitqueue_head(&found->wait);
+	INIT_LIST_HEAD(&found->ro_bgs);
 
 	ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
 				    info->space_info_kobj, "%s",
@@ -5425,7 +5438,17 @@ static int update_block_group(struct btrfs_root *root,
 		spin_unlock(&cache->space_info->lock);
 	} else {
 		old_val -= num_bytes;
+		btrfs_set_block_group_used(&cache->item, old_val);
+		cache->pinned += num_bytes;
+		cache->space_info->bytes_pinned += num_bytes;
+		cache->space_info->bytes_used -= num_bytes;
+		cache->space_info->disk_used -= num_bytes * factor;
+		spin_unlock(&cache->lock);
+		spin_unlock(&cache->space_info->lock);
 
+		set_extent_dirty(info->pinned_extents,
+				 bytenr, bytenr + num_bytes - 1,
+				 GFP_NOFS | __GFP_NOFAIL);
 		/*
 		 * No longer have used bytes in this block group, queue
 		 * it for deletion.
@@ -5439,17 +5462,6 @@ static int update_block_group(struct btrfs_root *root,
 			}
 			spin_unlock(&info->unused_bgs_lock);
 		}
-		btrfs_set_block_group_used(&cache->item, old_val);
-		cache->pinned += num_bytes;
-		cache->space_info->bytes_pinned += num_bytes;
-		cache->space_info->bytes_used -= num_bytes;
-		cache->space_info->disk_used -= num_bytes * factor;
-		spin_unlock(&cache->lock);
-		spin_unlock(&cache->space_info->lock);
-
-		set_extent_dirty(info->pinned_extents,
-				 bytenr, bytenr + num_bytes - 1,
-				 GFP_NOFS | __GFP_NOFAIL);
 	}
 	btrfs_put_block_group(cache);
 	total -= num_bytes;
@@ -8511,6 +8523,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 	    min_allocable_bytes <= sinfo->total_bytes) {
 		sinfo->bytes_readonly += num_bytes;
 		cache->ro = 1;
+		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
 		ret = 0;
 	}
 out:
@@ -8565,15 +8578,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
 
 /*
  * helper to account the unused space of all the readonly block group in the
- * list. takes mirrors into account.
+ * space_info. takes mirrors into account.
  */
-static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
 {
 	struct btrfs_block_group_cache *block_group;
 	u64 free_bytes = 0;
 	int factor;
 
-	list_for_each_entry(block_group, groups_list, list) {
+	/* It's df, we don't care if it's racey */
+	if (list_empty(&sinfo->ro_bgs))
+		return 0;
+
+	spin_lock(&sinfo->lock);
+	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
 		spin_lock(&block_group->lock);
 
 		if (!block_group->ro) {
@@ -8594,26 +8612,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
 
 		spin_unlock(&block_group->lock);
 	}
-
-	return free_bytes;
-}
-
-/*
- * helper to account the unused space of all the readonly block group in the
- * space_info. takes mirrors into account.
- */
-u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
-{
-	int i;
-	u64 free_bytes = 0;
-
-	spin_lock(&sinfo->lock);
-
-	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
-		if (!list_empty(&sinfo->block_groups[i]))
-			free_bytes += __btrfs_get_ro_block_group_free_space(
-						&sinfo->block_groups[i]);
-
 	spin_unlock(&sinfo->lock);
 
 	return free_bytes;
@@ -8633,6 +8631,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
 		  cache->bytes_super - btrfs_block_group_used(&cache->item);
 	sinfo->bytes_readonly -= num_bytes;
 	cache->ro = 0;
+	list_del_init(&cache->ro_list);
 	spin_unlock(&cache->lock);
 	spin_unlock(&sinfo->lock);
 }
@@ -9002,7 +9001,9 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
 	INIT_LIST_HEAD(&cache->bg_list);
+	INIT_LIST_HEAD(&cache->ro_list);
 	btrfs_init_free_space_ctl(cache);
+	atomic_set(&cache->trimming, 0);
 
 	return cache;
 }
@@ -9195,9 +9196,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 	int ret = 0;
 
 	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
-		list_del_init(&block_group->bg_list);
 		if (ret)
-			continue;
+			goto next;
 
 		spin_lock(&block_group->lock);
 		memcpy(&item, &block_group->item, sizeof(item));
@@ -9212,6 +9212,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 				key.objectid, key.offset);
 		if (ret)
 			btrfs_abort_transaction(trans, extent_root, ret);
+next:
+		list_del_init(&block_group->bg_list);
 	}
 }
 
@@ -9304,7 +9306,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 }
 
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 group_start)
+			     struct btrfs_root *root, u64 group_start,
+			     struct extent_map *em)
 {
 	struct btrfs_path *path;
 	struct btrfs_block_group_cache *block_group;
@@ -9316,6 +9319,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	int ret;
 	int index;
 	int factor;
+	struct btrfs_caching_control *caching_ctl = NULL;
+	bool remove_em;
 
 	root = root->fs_info->extent_root;
 
@@ -9400,6 +9405,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	spin_lock(&root->fs_info->block_group_cache_lock);
 	rb_erase(&block_group->cache_node,
 		 &root->fs_info->block_group_cache_tree);
+	RB_CLEAR_NODE(&block_group->cache_node);
 
 	if (root->fs_info->first_logical_byte == block_group->key.objectid)
 		root->fs_info->first_logical_byte = (u64)-1;
@@ -9411,6 +9417,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	 * are still on the list after taking the semaphore
 	 */
 	list_del_init(&block_group->list);
+	list_del_init(&block_group->ro_list);
 	if (list_empty(&block_group->space_info->block_groups[index])) {
 		kobj = block_group->space_info->block_group_kobjs[index];
 		block_group->space_info->block_group_kobjs[index] = NULL;
@@ -9422,8 +9429,32 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		kobject_put(kobj);
 	}
 
+	if (block_group->has_caching_ctl)
+		caching_ctl = get_caching_control(block_group);
 	if (block_group->cached == BTRFS_CACHE_STARTED)
 		wait_block_group_cache_done(block_group);
+	if (block_group->has_caching_ctl) {
+		down_write(&root->fs_info->commit_root_sem);
+		if (!caching_ctl) {
+			struct btrfs_caching_control *ctl;
+
+			list_for_each_entry(ctl,
+				    &root->fs_info->caching_block_groups, list)
+				if (ctl->block_group == block_group) {
+					caching_ctl = ctl;
+					atomic_inc(&caching_ctl->count);
+					break;
+				}
+		}
+		if (caching_ctl)
+			list_del_init(&caching_ctl->list);
+		up_write(&root->fs_info->commit_root_sem);
+		if (caching_ctl) {
+			/* Once for the caching bgs list and once for us. */
+			put_caching_control(caching_ctl);
+			put_caching_control(caching_ctl);
+		}
+	}
 
 	btrfs_remove_free_space_cache(block_group);
 
@@ -9435,6 +9466,71 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	memcpy(&key, &block_group->key, sizeof(key));
 
+	lock_chunks(root);
+	if (!list_empty(&em->list)) {
+		/* We're in the transaction->pending_chunks list. */
+		free_extent_map(em);
+	}
+	spin_lock(&block_group->lock);
+	block_group->removed = 1;
+	/*
+	 * At this point trimming can't start on this block group, because we
+	 * removed the block group from the tree fs_info->block_group_cache_tree
+	 * so no one can't find it anymore and even if someone already got this
+	 * block group before we removed it from the rbtree, they have already
+	 * incremented block_group->trimming - if they didn't, they won't find
+	 * any free space entries because we already removed them all when we
+	 * called btrfs_remove_free_space_cache().
+	 *
+	 * And we must not remove the extent map from the fs_info->mapping_tree
+	 * to prevent the same logical address range and physical device space
+	 * ranges from being reused for a new block group. This is because our
+	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+	 * completely transactionless, so while it is trimming a range the
+	 * currently running transaction might finish and a new one start,
+	 * allowing for new block groups to be created that can reuse the same
+	 * physical device locations unless we take this special care.
+	 */
+	remove_em = (atomic_read(&block_group->trimming) == 0);
+	/*
+	 * Make sure a trimmer task always sees the em in the pinned_chunks list
+	 * if it sees block_group->removed == 1 (needs to lock block_group->lock
+	 * before checking block_group->removed).
+	 */
+	if (!remove_em) {
+		/*
+		 * Our em might be in trans->transaction->pending_chunks which
+		 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
+		 * and so is the fs_info->pinned_chunks list.
+		 *
+		 * So at this point we must be holding the chunk_mutex to avoid
+		 * any races with chunk allocation (more specifically at
+		 * volumes.c:contains_pending_extent()), to ensure it always
+		 * sees the em, either in the pending_chunks list or in the
+		 * pinned_chunks list.
+		 */
+		list_move_tail(&em->list, &root->fs_info->pinned_chunks);
+	}
+	spin_unlock(&block_group->lock);
+
+	if (remove_em) {
+		struct extent_map_tree *em_tree;
+
+		em_tree = &root->fs_info->mapping_tree.map_tree;
+		write_lock(&em_tree->lock);
+		/*
+		 * The em might be in the pending_chunks list, so make sure the
+		 * chunk mutex is locked, since remove_extent_mapping() will
+		 * delete us from that list.
+		 */
+		remove_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+		/* once for the tree */
+		free_extent_map(em);
+	}
+
+	unlock_chunks(root);
+
 	btrfs_put_block_group(block_group);
 	btrfs_put_block_group(block_group);
 
@@ -9523,10 +9619,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 	 */
 	start = block_group->key.objectid;
 	end = start + block_group->key.offset - 1;
-	clear_extent_bits(&fs_info->freed_extents[0], start, end,
+	ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
 			  EXTENT_DIRTY, GFP_NOFS);
-	clear_extent_bits(&fs_info->freed_extents[1], start, end,
+	if (ret) {
+		btrfs_set_block_group_rw(root, block_group);
+		goto end_trans;
+	}
+	ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
 			  EXTENT_DIRTY, GFP_NOFS);
+	if (ret) {
+		btrfs_set_block_group_rw(root, block_group);
+		goto end_trans;
+	}
 
 	/* Reset pinned so btrfs_put_block_group doesn't complain */
 	block_group->pinned = 0;
@@ -9537,6 +9641,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		 */
 		ret = btrfs_remove_chunk(trans, root,
 					 block_group->key.objectid);
+end_trans:
 		btrfs_end_transaction(trans, root);
 next:
 		btrfs_put_block_group(block_group);
@@ -9657,12 +9762,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 }
 
 /*
- * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
- * they are used to prevent the some tasks writing data into the page cache
- * by nocow before the subvolume is snapshoted, but flush the data into
- * the disk after the snapshot creation.
+ * btrfs_{start,end}_write_no_snapshoting() are similar to
+ * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
+ * data into the page cache through nocow before the subvolume is snapshoted,
+ * but flush the data into disk after the snapshot creation, or to prevent
+ * operations while snapshoting is ongoing and that cause the snapshot to be
+ * inconsistent (writes followed by expanding truncates for example).
  */
-void btrfs_end_nocow_write(struct btrfs_root *root)
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
 {
 	percpu_counter_dec(&root->subv_writers->counter);
 	/*
@@ -9674,7 +9781,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
 	wake_up(&root->subv_writers->wait);
 }
 
-int btrfs_start_nocow_write(struct btrfs_root *root)
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
 {
 	if (atomic_read(&root->will_be_snapshoted))
 		return 0;
@@ -9685,7 +9792,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
9685 */ 9792 */
9686 smp_mb(); 9793 smp_mb();
9687 if (atomic_read(&root->will_be_snapshoted)) { 9794 if (atomic_read(&root->will_be_snapshoted)) {
9688 btrfs_end_nocow_write(root); 9795 btrfs_end_write_no_snapshoting(root);
9689 return 0; 9796 return 0;
9690 } 9797 }
9691 return 1; 9798 return 1;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bf3f424e0013..4ebabd237153 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -595,9 +595,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
595 clear = 1; 595 clear = 1;
596again: 596again:
597 if (!prealloc && (mask & __GFP_WAIT)) { 597 if (!prealloc && (mask & __GFP_WAIT)) {
598 /*
599 * Don't worry about allocation failure here because we might end
600 * up not needing the pre-allocated extent state at all, which
601 * is the case if the extent states in the tree cover our input
602 * range without extending beyond it.
603 * If we end up needing a new extent state we allocate it later.
604 */
598 prealloc = alloc_extent_state(mask); 605 prealloc = alloc_extent_state(mask);
599 if (!prealloc)
600 return -ENOMEM;
601 } 606 }
602 607
603 spin_lock(&tree->lock); 608 spin_lock(&tree->lock);
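
The hunk above makes the pre-allocation in clear_extent_bit() best effort: a failed allocation is no longer fatal up front, only once a state split or merge actually requires the extra object. A minimal sketch of the pattern, with alloc_state(), needs_new_state() and insert_state() as hypothetical stand-ins:

	struct extent_state *prealloc = NULL;
again:
	if (!prealloc && (mask & __GFP_WAIT))
		prealloc = alloc_state(mask);	/* NULL is acceptable here */

	spin_lock(&tree->lock);
	if (needs_new_state(tree, start, end)) {
		if (!prealloc) {
			/* Now it matters: drop the lock and retry. */
			spin_unlock(&tree->lock);
			prealloc = alloc_state(mask | __GFP_NOFAIL);
			goto again;
		}
		insert_state(tree, prealloc, start, end);
		prealloc = NULL;
	}
	spin_unlock(&tree->lock);
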
@@ -796,17 +801,25 @@ static void set_state_bits(struct extent_io_tree *tree,
796 state->state |= bits_to_set; 801 state->state |= bits_to_set;
797} 802}
798 803
799static void cache_state(struct extent_state *state, 804static void cache_state_if_flags(struct extent_state *state,
800 struct extent_state **cached_ptr) 805 struct extent_state **cached_ptr,
806 const u64 flags)
801{ 807{
802 if (cached_ptr && !(*cached_ptr)) { 808 if (cached_ptr && !(*cached_ptr)) {
803 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { 809 if (!flags || (state->state & flags)) {
804 *cached_ptr = state; 810 *cached_ptr = state;
805 atomic_inc(&state->refs); 811 atomic_inc(&state->refs);
806 } 812 }
807 } 813 }
808} 814}
809 815
816static void cache_state(struct extent_state *state,
817 struct extent_state **cached_ptr)
818{
819 return cache_state_if_flags(state, cached_ptr,
820 EXTENT_IOBITS | EXTENT_BOUNDARY);
821}
822
810/* 823/*
811 * set some bits on a range in the tree. This may require allocations or 824 * set some bits on a range in the tree. This may require allocations or
812 * sleeping, so the gfp mask is used to indicate what is allowed. 825 * sleeping, so the gfp mask is used to indicate what is allowed.
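
cache_state() is now just a wrapper: cache_state_if_flags() only caches a state that carries one of the requested bits, and a flags value of 0 acts as a wildcard, which is what the find_first_extent_bit() change further down relies on. The filter, restated as a sketch:

	/* Cache 'state' in *cached only if it matches 'flags' (0 = any). */
	if (cached && !*cached && (!flags || (state->state & flags))) {
		*cached = state;
		atomic_inc(&state->refs);
	}
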
@@ -1058,13 +1071,21 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1058 int err = 0; 1071 int err = 0;
1059 u64 last_start; 1072 u64 last_start;
1060 u64 last_end; 1073 u64 last_end;
1074 bool first_iteration = true;
1061 1075
1062 btrfs_debug_check_extent_io_range(tree, start, end); 1076 btrfs_debug_check_extent_io_range(tree, start, end);
1063 1077
1064again: 1078again:
1065 if (!prealloc && (mask & __GFP_WAIT)) { 1079 if (!prealloc && (mask & __GFP_WAIT)) {
1080 /*
1081 * Best effort, don't worry if extent state allocation fails
1082 * here for the first iteration. We might have a cached state
1083 * that matches exactly the target range, in which case no
1084 * extent state allocations are needed. We'll only know this
1085 * after locking the tree.
1086 */
1066 prealloc = alloc_extent_state(mask); 1087 prealloc = alloc_extent_state(mask);
1067 if (!prealloc) 1088 if (!prealloc && !first_iteration)
1068 return -ENOMEM; 1089 return -ENOMEM;
1069 } 1090 }
1070 1091
@@ -1234,6 +1255,7 @@ search_again:
1234 spin_unlock(&tree->lock); 1255 spin_unlock(&tree->lock);
1235 if (mask & __GFP_WAIT) 1256 if (mask & __GFP_WAIT)
1236 cond_resched(); 1257 cond_resched();
1258 first_iteration = false;
1237 goto again; 1259 goto again;
1238} 1260}
1239 1261
@@ -1482,7 +1504,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1482 state = find_first_extent_bit_state(tree, start, bits); 1504 state = find_first_extent_bit_state(tree, start, bits);
1483got_it: 1505got_it:
1484 if (state) { 1506 if (state) {
1485 cache_state(state, cached_state); 1507 cache_state_if_flags(state, cached_state, 0);
1486 *start_ret = state->start; 1508 *start_ret = state->start;
1487 *end_ret = state->end; 1509 *end_ret = state->end;
1488 ret = 0; 1510 ret = 0;
@@ -1746,6 +1768,9 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1746 if (page_ops == 0) 1768 if (page_ops == 0)
1747 return 0; 1769 return 0;
1748 1770
1771 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1772 mapping_set_error(inode->i_mapping, -EIO);
1773
1749 while (nr_pages > 0) { 1774 while (nr_pages > 0) {
1750 ret = find_get_pages_contig(inode->i_mapping, index, 1775 ret = find_get_pages_contig(inode->i_mapping, index,
1751 min_t(unsigned long, 1776 min_t(unsigned long,
@@ -1763,6 +1788,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1763 clear_page_dirty_for_io(pages[i]); 1788 clear_page_dirty_for_io(pages[i]);
1764 if (page_ops & PAGE_SET_WRITEBACK) 1789 if (page_ops & PAGE_SET_WRITEBACK)
1765 set_page_writeback(pages[i]); 1790 set_page_writeback(pages[i]);
1791 if (page_ops & PAGE_SET_ERROR)
1792 SetPageError(pages[i]);
1766 if (page_ops & PAGE_END_WRITEBACK) 1793 if (page_ops & PAGE_END_WRITEBACK)
1767 end_page_writeback(pages[i]); 1794 end_page_writeback(pages[i]);
1768 if (page_ops & PAGE_UNLOCK) 1795 if (page_ops & PAGE_UNLOCK)
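
PAGE_SET_ERROR gives the delalloc cleanup helper a way to record failed async writes: mapping_set_error() latches -EIO on the address_space so a later fsync()/msync() reports it, and SetPageError() flags the individual pages. An error path then uses it roughly like the submit_compressed_extents() failure handling added in inode.c below:

	/* Unlock the range, end writeback and latch -EIO for fsync. */
	extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
				     PAGE_END_WRITEBACK | PAGE_SET_ERROR);
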
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6d4b938be986..ece9ce87edff 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -49,6 +49,7 @@
49#define PAGE_SET_WRITEBACK (1 << 2) 49#define PAGE_SET_WRITEBACK (1 << 2)
50#define PAGE_END_WRITEBACK (1 << 3) 50#define PAGE_END_WRITEBACK (1 << 3)
51#define PAGE_SET_PRIVATE2 (1 << 4) 51#define PAGE_SET_PRIVATE2 (1 << 4)
52#define PAGE_SET_ERROR (1 << 5)
52 53
53/* 54/*
54 * page->private values. Every page that is controlled by the extent 55 * page->private values. Every page that is controlled by the extent
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 225302b39afb..6a98bddd8f33 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -287,8 +287,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
287 if (!em) 287 if (!em)
288 goto out; 288 goto out;
289 289
290 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
291 list_move(&em->list, &tree->modified_extents);
292 em->generation = gen; 290 em->generation = gen;
293 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 291 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
294 em->mod_start = em->start; 292 em->mod_start = em->start;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a18ceabd99a8..e4090259569b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1428,7 +1428,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1428 u64 num_bytes; 1428 u64 num_bytes;
1429 int ret; 1429 int ret;
1430 1430
1431 ret = btrfs_start_nocow_write(root); 1431 ret = btrfs_start_write_no_snapshoting(root);
1432 if (!ret) 1432 if (!ret)
1433 return -ENOSPC; 1433 return -ENOSPC;
1434 1434
@@ -1451,7 +1451,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1451 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); 1451 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
1452 if (ret <= 0) { 1452 if (ret <= 0) {
1453 ret = 0; 1453 ret = 0;
1454 btrfs_end_nocow_write(root); 1454 btrfs_end_write_no_snapshoting(root);
1455 } else { 1455 } else {
1456 *write_bytes = min_t(size_t, *write_bytes , 1456 *write_bytes = min_t(size_t, *write_bytes ,
1457 num_bytes - pos + lockstart); 1457 num_bytes - pos + lockstart);
@@ -1543,7 +1543,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1543 btrfs_free_reserved_data_space(inode, 1543 btrfs_free_reserved_data_space(inode,
1544 reserve_bytes); 1544 reserve_bytes);
1545 else 1545 else
1546 btrfs_end_nocow_write(root); 1546 btrfs_end_write_no_snapshoting(root);
1547 break; 1547 break;
1548 } 1548 }
1549 1549
@@ -1632,7 +1632,7 @@ again:
1632 1632
1633 release_bytes = 0; 1633 release_bytes = 0;
1634 if (only_release_metadata) 1634 if (only_release_metadata)
1635 btrfs_end_nocow_write(root); 1635 btrfs_end_write_no_snapshoting(root);
1636 1636
1637 if (only_release_metadata && copied > 0) { 1637 if (only_release_metadata && copied > 0) {
1638 u64 lockstart = round_down(pos, root->sectorsize); 1638 u64 lockstart = round_down(pos, root->sectorsize);
@@ -1661,7 +1661,7 @@ again:
1661 1661
1662 if (release_bytes) { 1662 if (release_bytes) {
1663 if (only_release_metadata) { 1663 if (only_release_metadata) {
1664 btrfs_end_nocow_write(root); 1664 btrfs_end_write_no_snapshoting(root);
1665 btrfs_delalloc_release_metadata(inode, release_bytes); 1665 btrfs_delalloc_release_metadata(inode, release_bytes);
1666 } else { 1666 } else {
1667 btrfs_delalloc_release_space(inode, release_bytes); 1667 btrfs_delalloc_release_space(inode, release_bytes);
@@ -1676,6 +1676,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1676 loff_t pos) 1676 loff_t pos)
1677{ 1677{
1678 struct file *file = iocb->ki_filp; 1678 struct file *file = iocb->ki_filp;
1679 struct inode *inode = file_inode(file);
1679 ssize_t written; 1680 ssize_t written;
1680 ssize_t written_buffered; 1681 ssize_t written_buffered;
1681 loff_t endbyte; 1682 loff_t endbyte;
@@ -1692,8 +1693,15 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1692 err = written_buffered; 1693 err = written_buffered;
1693 goto out; 1694 goto out;
1694 } 1695 }
1696 /*
1697 * Ensure all data is persisted. We want the next direct IO read to be
1698 * able to read what was just written.
1699 */
1695 endbyte = pos + written_buffered - 1; 1700 endbyte = pos + written_buffered - 1;
1696 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); 1701 err = btrfs_fdatawrite_range(inode, pos, endbyte);
1702 if (err)
1703 goto out;
1704 err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1697 if (err) 1705 if (err)
1698 goto out; 1706 goto out;
1699 written += written_buffered; 1707 written += written_buffered;
@@ -1854,10 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1854 int ret; 1862 int ret;
1855 1863
1856 atomic_inc(&BTRFS_I(inode)->sync_writers); 1864 atomic_inc(&BTRFS_I(inode)->sync_writers);
1857 ret = filemap_fdatawrite_range(inode->i_mapping, start, end); 1865 ret = btrfs_fdatawrite_range(inode, start, end);
1858 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1859 &BTRFS_I(inode)->runtime_flags))
1860 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1861 atomic_dec(&BTRFS_I(inode)->sync_writers); 1866 atomic_dec(&BTRFS_I(inode)->sync_writers);
1862 1867
1863 return ret; 1868 return ret;
@@ -2810,3 +2815,29 @@ int btrfs_auto_defrag_init(void)
2810 2815
2811 return 0; 2816 return 0;
2812} 2817}
2818
2819int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
2820{
2821 int ret;
2822
2823 /*
2824 * So with compression we will find and lock a dirty page and clear the
2825 * first one as dirty, setup an async extent, and immediately return
2826 * with the entire range locked but with nobody actually marked with
2827 * writeback. So we can't just filemap_write_and_wait_range() and
2828 * expect it to work since it will just kick off a thread to do the
2829 * actual work. So we need to call filemap_fdatawrite_range _again_
2830 * since it will wait on the page lock, which won't be unlocked until
2831 * after the pages have been marked as writeback and so we're good to go
2832 * from there. We have to do this otherwise we'll miss the ordered
2833 * extents and that results in badness. Please Josef, do not think you
2834 * know better and pull this out at some point in the future, it is
2835 * right and you are wrong.
2836 */
2837 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
2838 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
2839 &BTRFS_I(inode)->runtime_flags))
2840 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
2841
2842 return ret;
2843}
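
btrfs_fdatawrite_range() exists because, with compression, the first filemap_fdatawrite_range() can return before any page is marked writeback, so the write must be issued twice while async extents are pending. Callers that need the IO completed still pair it with a wait; a minimal caller sketch (the helper name is illustrative, mirroring what __btrfs_direct_write() and btrfs_wait_ordered_range() do after this change):

	static int flush_and_wait_range(struct inode *inode,
					loff_t start, loff_t end)
	{
		int ret = btrfs_fdatawrite_range(inode, start, end);

		if (ret)
			return ret;
		return filemap_fdatawait_range(inode->i_mapping, start, end);
	}
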
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 33848196550e..030847bf7cec 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -27,10 +27,17 @@
27#include "disk-io.h" 27#include "disk-io.h"
28#include "extent_io.h" 28#include "extent_io.h"
29#include "inode-map.h" 29#include "inode-map.h"
30#include "volumes.h"
30 31
31#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 32#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
32#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 33#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
33 34
35struct btrfs_trim_range {
36 u64 start;
37 u64 bytes;
38 struct list_head list;
39};
40
34static int link_free_space(struct btrfs_free_space_ctl *ctl, 41static int link_free_space(struct btrfs_free_space_ctl *ctl,
35 struct btrfs_free_space *info); 42 struct btrfs_free_space *info);
36static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 43static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
@@ -881,6 +888,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
881 int ret; 888 int ret;
882 struct btrfs_free_cluster *cluster = NULL; 889 struct btrfs_free_cluster *cluster = NULL;
883 struct rb_node *node = rb_first(&ctl->free_space_offset); 890 struct rb_node *node = rb_first(&ctl->free_space_offset);
891 struct btrfs_trim_range *trim_entry;
884 892
885 /* Get the cluster for this block_group if it exists */ 893 /* Get the cluster for this block_group if it exists */
886 if (block_group && !list_empty(&block_group->cluster_list)) { 894 if (block_group && !list_empty(&block_group->cluster_list)) {
@@ -916,6 +924,21 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
916 cluster = NULL; 924 cluster = NULL;
917 } 925 }
918 } 926 }
927
928 /*
929 * Make sure we don't miss any range that was removed from our rbtree
930 * because trimming is running. Otherwise after a umount+mount (or crash
931 * after committing the transaction) we would leak free space and get
932 * an inconsistent free space cache report from fsck.
933 */
934 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
935 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
936 trim_entry->bytes, NULL);
937 if (ret)
938 goto fail;
939 *entries += 1;
940 }
941
919 return 0; 942 return 0;
920fail: 943fail:
921 return -ENOSPC; 944 return -ENOSPC;
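
The trimming_ranges list is what keeps the cache writeout honest: a range leaves the free-space rbtree while its discard is in flight, so each trimmer publishes it under cache_writeout_mutex and unpublishes it only after the space has been re-added. The life cycle, distilled from the do_trimming()/trim_no_bitmap() hunks below (a sketch of this patch's own flow; the caller already holds cache_writeout_mutex and has unlinked the range from the rbtree):

	struct btrfs_trim_range trim_entry;

	trim_entry.start = extent_start;
	trim_entry.bytes = extent_bytes;
	list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
	mutex_unlock(&ctl->cache_writeout_mutex);

	/* ... issue the discard with no locks held ... */

	mutex_lock(&ctl->cache_writeout_mutex);
	btrfs_add_free_space(block_group, extent_start, extent_bytes);
	list_del(&trim_entry.list);
	mutex_unlock(&ctl->cache_writeout_mutex);
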
@@ -1135,12 +1158,15 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1135 1158
1136 io_ctl_set_generation(&io_ctl, trans->transid); 1159 io_ctl_set_generation(&io_ctl, trans->transid);
1137 1160
1161 mutex_lock(&ctl->cache_writeout_mutex);
1138 /* Write out the extent entries in the free space cache */ 1162 /* Write out the extent entries in the free space cache */
1139 ret = write_cache_extent_entries(&io_ctl, ctl, 1163 ret = write_cache_extent_entries(&io_ctl, ctl,
1140 block_group, &entries, &bitmaps, 1164 block_group, &entries, &bitmaps,
1141 &bitmap_list); 1165 &bitmap_list);
1142 if (ret) 1166 if (ret) {
1167 mutex_unlock(&ctl->cache_writeout_mutex);
1143 goto out_nospc; 1168 goto out_nospc;
1169 }
1144 1170
1145 /* 1171 /*
1146 * Some spaces that are freed in the current transaction are pinned, 1172 * Some spaces that are freed in the current transaction are pinned,
@@ -1148,11 +1174,18 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1148 * committed, we shouldn't lose them. 1174 * committed, we shouldn't lose them.
1149 */ 1175 */
1150 ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); 1176 ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
1151 if (ret) 1177 if (ret) {
1178 mutex_unlock(&ctl->cache_writeout_mutex);
1152 goto out_nospc; 1179 goto out_nospc;
1180 }
1153 1181
1154 /* At last, we write out all the bitmaps. */ 1182 /*
1183 * Finally, we write out all the bitmaps and keep cache_writeout_mutex
1184 * locked while doing it because a concurrent trim can be manipulating
1185 * or freeing the bitmap.
1186 */
1155 ret = write_bitmap_entries(&io_ctl, &bitmap_list); 1187 ret = write_bitmap_entries(&io_ctl, &bitmap_list);
1188 mutex_unlock(&ctl->cache_writeout_mutex);
1156 if (ret) 1189 if (ret)
1157 goto out_nospc; 1190 goto out_nospc;
1158 1191
@@ -2295,6 +2328,8 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
2295 ctl->start = block_group->key.objectid; 2328 ctl->start = block_group->key.objectid;
2296 ctl->private = block_group; 2329 ctl->private = block_group;
2297 ctl->op = &free_space_op; 2330 ctl->op = &free_space_op;
2331 INIT_LIST_HEAD(&ctl->trimming_ranges);
2332 mutex_init(&ctl->cache_writeout_mutex);
2298 2333
2299 /* 2334 /*
2300 * we only want to have 32k of ram per block group for keeping 2335 * we only want to have 32k of ram per block group for keeping
@@ -2911,10 +2946,12 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2911 2946
2912static int do_trimming(struct btrfs_block_group_cache *block_group, 2947static int do_trimming(struct btrfs_block_group_cache *block_group,
2913 u64 *total_trimmed, u64 start, u64 bytes, 2948 u64 *total_trimmed, u64 start, u64 bytes,
2914 u64 reserved_start, u64 reserved_bytes) 2949 u64 reserved_start, u64 reserved_bytes,
2950 struct btrfs_trim_range *trim_entry)
2915{ 2951{
2916 struct btrfs_space_info *space_info = block_group->space_info; 2952 struct btrfs_space_info *space_info = block_group->space_info;
2917 struct btrfs_fs_info *fs_info = block_group->fs_info; 2953 struct btrfs_fs_info *fs_info = block_group->fs_info;
2954 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2918 int ret; 2955 int ret;
2919 int update = 0; 2956 int update = 0;
2920 u64 trimmed = 0; 2957 u64 trimmed = 0;
@@ -2934,7 +2971,10 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
2934 if (!ret) 2971 if (!ret)
2935 *total_trimmed += trimmed; 2972 *total_trimmed += trimmed;
2936 2973
2974 mutex_lock(&ctl->cache_writeout_mutex);
2937 btrfs_add_free_space(block_group, reserved_start, reserved_bytes); 2975 btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
2976 list_del(&trim_entry->list);
2977 mutex_unlock(&ctl->cache_writeout_mutex);
2938 2978
2939 if (update) { 2979 if (update) {
2940 spin_lock(&space_info->lock); 2980 spin_lock(&space_info->lock);
@@ -2962,16 +3002,21 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2962 u64 bytes; 3002 u64 bytes;
2963 3003
2964 while (start < end) { 3004 while (start < end) {
3005 struct btrfs_trim_range trim_entry;
3006
3007 mutex_lock(&ctl->cache_writeout_mutex);
2965 spin_lock(&ctl->tree_lock); 3008 spin_lock(&ctl->tree_lock);
2966 3009
2967 if (ctl->free_space < minlen) { 3010 if (ctl->free_space < minlen) {
2968 spin_unlock(&ctl->tree_lock); 3011 spin_unlock(&ctl->tree_lock);
3012 mutex_unlock(&ctl->cache_writeout_mutex);
2969 break; 3013 break;
2970 } 3014 }
2971 3015
2972 entry = tree_search_offset(ctl, start, 0, 1); 3016 entry = tree_search_offset(ctl, start, 0, 1);
2973 if (!entry) { 3017 if (!entry) {
2974 spin_unlock(&ctl->tree_lock); 3018 spin_unlock(&ctl->tree_lock);
3019 mutex_unlock(&ctl->cache_writeout_mutex);
2975 break; 3020 break;
2976 } 3021 }
2977 3022
@@ -2980,6 +3025,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2980 node = rb_next(&entry->offset_index); 3025 node = rb_next(&entry->offset_index);
2981 if (!node) { 3026 if (!node) {
2982 spin_unlock(&ctl->tree_lock); 3027 spin_unlock(&ctl->tree_lock);
3028 mutex_unlock(&ctl->cache_writeout_mutex);
2983 goto out; 3029 goto out;
2984 } 3030 }
2985 entry = rb_entry(node, struct btrfs_free_space, 3031 entry = rb_entry(node, struct btrfs_free_space,
@@ -2988,6 +3034,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2988 3034
2989 if (entry->offset >= end) { 3035 if (entry->offset >= end) {
2990 spin_unlock(&ctl->tree_lock); 3036 spin_unlock(&ctl->tree_lock);
3037 mutex_unlock(&ctl->cache_writeout_mutex);
2991 break; 3038 break;
2992 } 3039 }
2993 3040
@@ -2997,6 +3044,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2997 bytes = min(extent_start + extent_bytes, end) - start; 3044 bytes = min(extent_start + extent_bytes, end) - start;
2998 if (bytes < minlen) { 3045 if (bytes < minlen) {
2999 spin_unlock(&ctl->tree_lock); 3046 spin_unlock(&ctl->tree_lock);
3047 mutex_unlock(&ctl->cache_writeout_mutex);
3000 goto next; 3048 goto next;
3001 } 3049 }
3002 3050
@@ -3004,9 +3052,13 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
3004 kmem_cache_free(btrfs_free_space_cachep, entry); 3052 kmem_cache_free(btrfs_free_space_cachep, entry);
3005 3053
3006 spin_unlock(&ctl->tree_lock); 3054 spin_unlock(&ctl->tree_lock);
3055 trim_entry.start = extent_start;
3056 trim_entry.bytes = extent_bytes;
3057 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3058 mutex_unlock(&ctl->cache_writeout_mutex);
3007 3059
3008 ret = do_trimming(block_group, total_trimmed, start, bytes, 3060 ret = do_trimming(block_group, total_trimmed, start, bytes,
3009 extent_start, extent_bytes); 3061 extent_start, extent_bytes, &trim_entry);
3010 if (ret) 3062 if (ret)
3011 break; 3063 break;
3012next: 3064next:
@@ -3035,17 +3087,21 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3035 3087
3036 while (offset < end) { 3088 while (offset < end) {
3037 bool next_bitmap = false; 3089 bool next_bitmap = false;
3090 struct btrfs_trim_range trim_entry;
3038 3091
3092 mutex_lock(&ctl->cache_writeout_mutex);
3039 spin_lock(&ctl->tree_lock); 3093 spin_lock(&ctl->tree_lock);
3040 3094
3041 if (ctl->free_space < minlen) { 3095 if (ctl->free_space < minlen) {
3042 spin_unlock(&ctl->tree_lock); 3096 spin_unlock(&ctl->tree_lock);
3097 mutex_unlock(&ctl->cache_writeout_mutex);
3043 break; 3098 break;
3044 } 3099 }
3045 3100
3046 entry = tree_search_offset(ctl, offset, 1, 0); 3101 entry = tree_search_offset(ctl, offset, 1, 0);
3047 if (!entry) { 3102 if (!entry) {
3048 spin_unlock(&ctl->tree_lock); 3103 spin_unlock(&ctl->tree_lock);
3104 mutex_unlock(&ctl->cache_writeout_mutex);
3049 next_bitmap = true; 3105 next_bitmap = true;
3050 goto next; 3106 goto next;
3051 } 3107 }
@@ -3054,6 +3110,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3054 ret2 = search_bitmap(ctl, entry, &start, &bytes); 3110 ret2 = search_bitmap(ctl, entry, &start, &bytes);
3055 if (ret2 || start >= end) { 3111 if (ret2 || start >= end) {
3056 spin_unlock(&ctl->tree_lock); 3112 spin_unlock(&ctl->tree_lock);
3113 mutex_unlock(&ctl->cache_writeout_mutex);
3057 next_bitmap = true; 3114 next_bitmap = true;
3058 goto next; 3115 goto next;
3059 } 3116 }
@@ -3061,6 +3118,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3061 bytes = min(bytes, end - start); 3118 bytes = min(bytes, end - start);
3062 if (bytes < minlen) { 3119 if (bytes < minlen) {
3063 spin_unlock(&ctl->tree_lock); 3120 spin_unlock(&ctl->tree_lock);
3121 mutex_unlock(&ctl->cache_writeout_mutex);
3064 goto next; 3122 goto next;
3065 } 3123 }
3066 3124
@@ -3069,9 +3127,13 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3069 free_bitmap(ctl, entry); 3127 free_bitmap(ctl, entry);
3070 3128
3071 spin_unlock(&ctl->tree_lock); 3129 spin_unlock(&ctl->tree_lock);
3130 trim_entry.start = start;
3131 trim_entry.bytes = bytes;
3132 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3133 mutex_unlock(&ctl->cache_writeout_mutex);
3072 3134
3073 ret = do_trimming(block_group, total_trimmed, start, bytes, 3135 ret = do_trimming(block_group, total_trimmed, start, bytes,
3074 start, bytes); 3136 start, bytes, &trim_entry);
3075 if (ret) 3137 if (ret)
3076 break; 3138 break;
3077next: 3139next:
@@ -3101,11 +3163,52 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
3101 3163
3102 *trimmed = 0; 3164 *trimmed = 0;
3103 3165
3166 spin_lock(&block_group->lock);
3167 if (block_group->removed) {
3168 spin_unlock(&block_group->lock);
3169 return 0;
3170 }
3171 atomic_inc(&block_group->trimming);
3172 spin_unlock(&block_group->lock);
3173
3104 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); 3174 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
3105 if (ret) 3175 if (ret)
3106 return ret; 3176 goto out;
3107 3177
3108 ret = trim_bitmaps(block_group, trimmed, start, end, minlen); 3178 ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
3179out:
3180 spin_lock(&block_group->lock);
3181 if (atomic_dec_and_test(&block_group->trimming) &&
3182 block_group->removed) {
3183 struct extent_map_tree *em_tree;
3184 struct extent_map *em;
3185
3186 spin_unlock(&block_group->lock);
3187
3188 em_tree = &block_group->fs_info->mapping_tree.map_tree;
3189 write_lock(&em_tree->lock);
3190 em = lookup_extent_mapping(em_tree, block_group->key.objectid,
3191 1);
3192 BUG_ON(!em); /* logic error, can't happen */
3193 remove_extent_mapping(em_tree, em);
3194 write_unlock(&em_tree->lock);
3195
3196 lock_chunks(block_group->fs_info->chunk_root);
3197 list_del_init(&em->list);
3198 unlock_chunks(block_group->fs_info->chunk_root);
3199
3200 /* once for us and once for the tree */
3201 free_extent_map(em);
3202 free_extent_map(em);
3203
3204 /*
3205 * We've left one free space entry and other tasks trimming
3206 * this block group have left 1 entry each one. Free them.
3207 */
3208 __btrfs_remove_free_space_cache(block_group->free_space_ctl);
3209 } else {
3210 spin_unlock(&block_group->lock);
3211 }
3109 3212
3110 return ret; 3213 return ret;
3111} 3214}
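
btrfs_trim_block_group() now brackets its work with a trimming count so that a block group removed mid-trim is torn down by whichever side finishes last: removal sets ->removed and leaves the extent map alone while the counter is non-zero, and the final trimmer performs the cleanup shown above. The handshake in isolation (cleanup_removed_bg() is a hypothetical name for the extent-map and free-space teardown):

	spin_lock(&bg->lock);
	if (bg->removed) {		/* raced with removal: nothing to do */
		spin_unlock(&bg->lock);
		return 0;
	}
	atomic_inc(&bg->trimming);
	spin_unlock(&bg->lock);

	/* ... trim extents and bitmaps ... */

	spin_lock(&bg->lock);
	if (atomic_dec_and_test(&bg->trimming) && bg->removed) {
		spin_unlock(&bg->lock);
		cleanup_removed_bg(bg);	/* last user frees em + cache */
	} else {
		spin_unlock(&bg->lock);
	}
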
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 0cf4977ef70d..88b2238a0aed 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -38,6 +38,8 @@ struct btrfs_free_space_ctl {
38 u64 start; 38 u64 start;
39 struct btrfs_free_space_op *op; 39 struct btrfs_free_space_op *op;
40 void *private; 40 void *private;
41 struct mutex cache_writeout_mutex;
42 struct list_head trimming_ranges;
41}; 43};
42 44
43struct btrfs_free_space_op { 45struct btrfs_free_space_op {
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 83d646bd2e4b..74faea3a516e 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root)
178 root->root_key.objectid); 178 root->root_key.objectid);
179 if (IS_ERR(tsk)) { 179 if (IS_ERR(tsk)) {
180 btrfs_warn(root->fs_info, "failed to start inode caching task"); 180 btrfs_warn(root->fs_info, "failed to start inode caching task");
181 btrfs_clear_and_info(root, CHANGE_INODE_CACHE, 181 btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE,
182 "disabling inode map caching"); 182 "disabling inode map caching");
183 } 183 }
184} 184}
@@ -364,6 +364,8 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root)
364 ctl->start = 0; 364 ctl->start = 0;
365 ctl->private = NULL; 365 ctl->private = NULL;
366 ctl->op = &free_ino_op; 366 ctl->op = &free_ino_op;
367 INIT_LIST_HEAD(&ctl->trimming_ranges);
368 mutex_init(&ctl->cache_writeout_mutex);
367 369
368 /* 370 /*
369 * Initially we allow to use 16K of ram to cache chunks of 371 * Initially we allow to use 16K of ram to cache chunks of
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ff0dcc016b71..e687bb0dc73a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -382,7 +382,7 @@ static inline int inode_need_compress(struct inode *inode)
382 * are written in the same order that the flusher thread sent them 382 * are written in the same order that the flusher thread sent them
383 * down. 383 * down.
384 */ 384 */
385static noinline int compress_file_range(struct inode *inode, 385static noinline void compress_file_range(struct inode *inode,
386 struct page *locked_page, 386 struct page *locked_page,
387 u64 start, u64 end, 387 u64 start, u64 end,
388 struct async_cow *async_cow, 388 struct async_cow *async_cow,
@@ -411,14 +411,6 @@ static noinline int compress_file_range(struct inode *inode,
411 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) 411 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
412 btrfs_add_inode_defrag(NULL, inode); 412 btrfs_add_inode_defrag(NULL, inode);
413 413
414 /*
415 * skip compression for a small file range(<=blocksize) that
416 * isn't an inline extent, since it dosen't save disk space at all.
417 */
418 if ((end - start + 1) <= blocksize &&
419 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
420 goto cleanup_and_bail_uncompressed;
421
422 actual_end = min_t(u64, isize, end + 1); 414 actual_end = min_t(u64, isize, end + 1);
423again: 415again:
424 will_compress = 0; 416 will_compress = 0;
@@ -440,6 +432,14 @@ again:
440 432
441 total_compressed = actual_end - start; 433 total_compressed = actual_end - start;
442 434
435 /*
436 * skip compression for a small file range(<=blocksize) that
437 * isn't an inline extent, since it dosen't save disk space at all.
438 */
439 if (total_compressed <= blocksize &&
440 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
441 goto cleanup_and_bail_uncompressed;
442
443 /* we want to make sure that amount of ram required to uncompress 443 /* we want to make sure that amount of ram required to uncompress
444 * an extent is reasonable, so we limit the total size in ram 444 * an extent is reasonable, so we limit the total size in ram
445 * of a compressed extent to 128k. This is a crucial number 445 * of a compressed extent to 128k. This is a crucial number
@@ -527,7 +527,10 @@ cont:
527 if (ret <= 0) { 527 if (ret <= 0) {
528 unsigned long clear_flags = EXTENT_DELALLOC | 528 unsigned long clear_flags = EXTENT_DELALLOC |
529 EXTENT_DEFRAG; 529 EXTENT_DEFRAG;
530 unsigned long page_error_op;
531
530 clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; 532 clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
533 page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
531 534
532 /* 535 /*
533 * inline extent creation worked or returned error, 536 * inline extent creation worked or returned error,
@@ -538,6 +541,7 @@ cont:
538 clear_flags, PAGE_UNLOCK | 541 clear_flags, PAGE_UNLOCK |
539 PAGE_CLEAR_DIRTY | 542 PAGE_CLEAR_DIRTY |
540 PAGE_SET_WRITEBACK | 543 PAGE_SET_WRITEBACK |
544 page_error_op |
541 PAGE_END_WRITEBACK); 545 PAGE_END_WRITEBACK);
542 goto free_pages_out; 546 goto free_pages_out;
543 } 547 }
@@ -620,8 +624,7 @@ cleanup_and_bail_uncompressed:
620 *num_added += 1; 624 *num_added += 1;
621 } 625 }
622 626
623out: 627 return;
624 return ret;
625 628
626free_pages_out: 629free_pages_out:
627 for (i = 0; i < nr_pages_ret; i++) { 630 for (i = 0; i < nr_pages_ret; i++) {
@@ -629,8 +632,22 @@ free_pages_out:
629 page_cache_release(pages[i]); 632 page_cache_release(pages[i]);
630 } 633 }
631 kfree(pages); 634 kfree(pages);
635}
632 636
633 goto out; 637static void free_async_extent_pages(struct async_extent *async_extent)
638{
639 int i;
640
641 if (!async_extent->pages)
642 return;
643
644 for (i = 0; i < async_extent->nr_pages; i++) {
645 WARN_ON(async_extent->pages[i]->mapping);
646 page_cache_release(async_extent->pages[i]);
647 }
648 kfree(async_extent->pages);
649 async_extent->nr_pages = 0;
650 async_extent->pages = NULL;
634} 651}
635 652
636/* 653/*
@@ -639,7 +656,7 @@ free_pages_out:
639 * queued. We walk all the async extents created by compress_file_range 656 * queued. We walk all the async extents created by compress_file_range
640 * and send them down to the disk. 657 * and send them down to the disk.
641 */ 658 */
642static noinline int submit_compressed_extents(struct inode *inode, 659static noinline void submit_compressed_extents(struct inode *inode,
643 struct async_cow *async_cow) 660 struct async_cow *async_cow)
644{ 661{
645 struct async_extent *async_extent; 662 struct async_extent *async_extent;
@@ -651,9 +668,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
651 struct extent_io_tree *io_tree; 668 struct extent_io_tree *io_tree;
652 int ret = 0; 669 int ret = 0;
653 670
654 if (list_empty(&async_cow->extents))
655 return 0;
656
657again: 671again:
658 while (!list_empty(&async_cow->extents)) { 672 while (!list_empty(&async_cow->extents)) {
659 async_extent = list_entry(async_cow->extents.next, 673 async_extent = list_entry(async_cow->extents.next,
@@ -709,15 +723,7 @@ retry:
709 async_extent->compressed_size, 723 async_extent->compressed_size,
710 0, alloc_hint, &ins, 1, 1); 724 0, alloc_hint, &ins, 1, 1);
711 if (ret) { 725 if (ret) {
712 int i; 726 free_async_extent_pages(async_extent);
713
714 for (i = 0; i < async_extent->nr_pages; i++) {
715 WARN_ON(async_extent->pages[i]->mapping);
716 page_cache_release(async_extent->pages[i]);
717 }
718 kfree(async_extent->pages);
719 async_extent->nr_pages = 0;
720 async_extent->pages = NULL;
721 727
722 if (ret == -ENOSPC) { 728 if (ret == -ENOSPC) {
723 unlock_extent(io_tree, async_extent->start, 729 unlock_extent(io_tree, async_extent->start,
@@ -814,15 +820,26 @@ retry:
814 ins.objectid, 820 ins.objectid,
815 ins.offset, async_extent->pages, 821 ins.offset, async_extent->pages,
816 async_extent->nr_pages); 822 async_extent->nr_pages);
823 if (ret) {
824 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
825 struct page *p = async_extent->pages[0];
826 const u64 start = async_extent->start;
827 const u64 end = start + async_extent->ram_size - 1;
828
829 p->mapping = inode->i_mapping;
830 tree->ops->writepage_end_io_hook(p, start, end,
831 NULL, 0);
832 p->mapping = NULL;
833 extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
834 PAGE_END_WRITEBACK |
835 PAGE_SET_ERROR);
836 free_async_extent_pages(async_extent);
837 }
817 alloc_hint = ins.objectid + ins.offset; 838 alloc_hint = ins.objectid + ins.offset;
818 kfree(async_extent); 839 kfree(async_extent);
819 if (ret)
820 goto out;
821 cond_resched(); 840 cond_resched();
822 } 841 }
823 ret = 0; 842 return;
824out:
825 return ret;
826out_free_reserve: 843out_free_reserve:
827 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 844 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
828out_free: 845out_free:
@@ -832,7 +849,9 @@ out_free:
832 NULL, EXTENT_LOCKED | EXTENT_DELALLOC | 849 NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
833 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, 850 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
834 PAGE_UNLOCK | PAGE_CLEAR_DIRTY | 851 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
835 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); 852 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
853 PAGE_SET_ERROR);
854 free_async_extent_pages(async_extent);
836 kfree(async_extent); 855 kfree(async_extent);
837 goto again; 856 goto again;
838} 857}
@@ -1318,7 +1337,7 @@ next_slot:
1318 * we fall into common COW way. 1337 * we fall into common COW way.
1319 */ 1338 */
1320 if (!nolock) { 1339 if (!nolock) {
1321 err = btrfs_start_nocow_write(root); 1340 err = btrfs_start_write_no_snapshoting(root);
1322 if (!err) 1341 if (!err)
1323 goto out_check; 1342 goto out_check;
1324 } 1343 }
@@ -1342,7 +1361,7 @@ out_check:
1342 if (extent_end <= start) { 1361 if (extent_end <= start) {
1343 path->slots[0]++; 1362 path->slots[0]++;
1344 if (!nolock && nocow) 1363 if (!nolock && nocow)
1345 btrfs_end_nocow_write(root); 1364 btrfs_end_write_no_snapshoting(root);
1346 goto next_slot; 1365 goto next_slot;
1347 } 1366 }
1348 if (!nocow) { 1367 if (!nocow) {
@@ -1362,7 +1381,7 @@ out_check:
1362 page_started, nr_written, 1); 1381 page_started, nr_written, 1);
1363 if (ret) { 1382 if (ret) {
1364 if (!nolock && nocow) 1383 if (!nolock && nocow)
1365 btrfs_end_nocow_write(root); 1384 btrfs_end_write_no_snapshoting(root);
1366 goto error; 1385 goto error;
1367 } 1386 }
1368 cow_start = (u64)-1; 1387 cow_start = (u64)-1;
@@ -1413,7 +1432,7 @@ out_check:
1413 num_bytes); 1432 num_bytes);
1414 if (ret) { 1433 if (ret) {
1415 if (!nolock && nocow) 1434 if (!nolock && nocow)
1416 btrfs_end_nocow_write(root); 1435 btrfs_end_write_no_snapshoting(root);
1417 goto error; 1436 goto error;
1418 } 1437 }
1419 } 1438 }
@@ -1424,7 +1443,7 @@ out_check:
1424 EXTENT_DELALLOC, PAGE_UNLOCK | 1443 EXTENT_DELALLOC, PAGE_UNLOCK |
1425 PAGE_SET_PRIVATE2); 1444 PAGE_SET_PRIVATE2);
1426 if (!nolock && nocow) 1445 if (!nolock && nocow)
1427 btrfs_end_nocow_write(root); 1446 btrfs_end_write_no_snapshoting(root);
1428 cur_offset = extent_end; 1447 cur_offset = extent_end;
1429 if (cur_offset > end) 1448 if (cur_offset > end)
1430 break; 1449 break;
@@ -4580,6 +4599,26 @@ next:
4580 return err; 4599 return err;
4581} 4600}
4582 4601
4602static int wait_snapshoting_atomic_t(atomic_t *a)
4603{
4604 schedule();
4605 return 0;
4606}
4607
4608static void wait_for_snapshot_creation(struct btrfs_root *root)
4609{
4610 while (true) {
4611 int ret;
4612
4613 ret = btrfs_start_write_no_snapshoting(root);
4614 if (ret)
4615 break;
4616 wait_on_atomic_t(&root->will_be_snapshoted,
4617 wait_snapshoting_atomic_t,
4618 TASK_UNINTERRUPTIBLE);
4619 }
4620}
4621
4583static int btrfs_setsize(struct inode *inode, struct iattr *attr) 4622static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4584{ 4623{
4585 struct btrfs_root *root = BTRFS_I(inode)->root; 4624 struct btrfs_root *root = BTRFS_I(inode)->root;
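
wait_for_snapshot_creation() is one half of a wait/wake pair on root->will_be_snapshoted: the ioctl side (see the create_snapshot() hunk in ioctl.c below) increments the counter before flushing writes and wakes waiters on the final atomic_dec_and_test(). Both halves, side by side:

	/* Writer (e.g. expanding truncate): retry until no snapshot is
	 * pending, sleeping on the atomic_t in between. */
	while (!btrfs_start_write_no_snapshoting(root))
		wait_on_atomic_t(&root->will_be_snapshoted,
				 wait_snapshoting_atomic_t,
				 TASK_UNINTERRUPTIBLE);

	/* Snapshot creation, on exit: */
	if (atomic_dec_and_test(&root->will_be_snapshoted))
		wake_up_atomic_t(&root->will_be_snapshoted);
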
@@ -4604,17 +4643,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4604 4643
4605 if (newsize > oldsize) { 4644 if (newsize > oldsize) {
4606 truncate_pagecache(inode, newsize); 4645 truncate_pagecache(inode, newsize);
4646 /*
4647 * Don't do an expanding truncate while snapshoting is ongoing.
4648 * This is to ensure the snapshot captures a fully consistent
4649 * state of this file - if the snapshot captures this expanding
4650 * truncation, it must capture all writes that happened before
4651 * this truncation.
4652 */
4653 wait_for_snapshot_creation(root);
4607 ret = btrfs_cont_expand(inode, oldsize, newsize); 4654 ret = btrfs_cont_expand(inode, oldsize, newsize);
4608 if (ret) 4655 if (ret) {
4656 btrfs_end_write_no_snapshoting(root);
4609 return ret; 4657 return ret;
4658 }
4610 4659
4611 trans = btrfs_start_transaction(root, 1); 4660 trans = btrfs_start_transaction(root, 1);
4612 if (IS_ERR(trans)) 4661 if (IS_ERR(trans)) {
4662 btrfs_end_write_no_snapshoting(root);
4613 return PTR_ERR(trans); 4663 return PTR_ERR(trans);
4664 }
4614 4665
4615 i_size_write(inode, newsize); 4666 i_size_write(inode, newsize);
4616 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 4667 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4617 ret = btrfs_update_inode(trans, root, inode); 4668 ret = btrfs_update_inode(trans, root, inode);
4669 btrfs_end_write_no_snapshoting(root);
4618 btrfs_end_transaction(trans, root); 4670 btrfs_end_transaction(trans, root);
4619 } else { 4671 } else {
4620 4672
@@ -7000,9 +7052,12 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7000 btrfs_put_ordered_extent(ordered); 7052 btrfs_put_ordered_extent(ordered);
7001 } else { 7053 } else {
7002 /* Screw you mmap */ 7054 /* Screw you mmap */
7003 ret = filemap_write_and_wait_range(inode->i_mapping, 7055 ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
7004 lockstart, 7056 if (ret)
7005 lockend); 7057 break;
7058 ret = filemap_fdatawait_range(inode->i_mapping,
7059 lockstart,
7060 lockend);
7006 if (ret) 7061 if (ret)
7007 break; 7062 break;
7008 7063
@@ -9442,6 +9497,21 @@ out_inode:
9442 9497
9443} 9498}
9444 9499
9500/* Inspired by filemap_check_errors() */
9501int btrfs_inode_check_errors(struct inode *inode)
9502{
9503 int ret = 0;
9504
9505 if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
9506 test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
9507 ret = -ENOSPC;
9508 if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
9509 test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
9510 ret = -EIO;
9511
9512 return ret;
9513}
9514
9445static const struct inode_operations btrfs_dir_inode_operations = { 9515static const struct inode_operations btrfs_dir_inode_operations = {
9446 .getattr = btrfs_getattr, 9516 .getattr = btrfs_getattr,
9447 .lookup = btrfs_lookup, 9517 .lookup = btrfs_lookup,
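
btrfs_inode_check_errors() mirrors filemap_check_errors(): it test-and-clears the latched AS_EIO/AS_ENOSPC bits instead of waiting on pages, which suits callers that have already waited on the ordered extents themselves. A hedged usage sketch (the caller shape is illustrative, not part of this patch):

	/* IO for the range was already waited on; just collect any
	 * writeback error that was latched on the mapping. */
	ret = btrfs_inode_check_errors(inode);
	if (ret)	/* -EIO or -ENOSPC from an earlier failed write */
		return ret;
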
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 080fe66c0349..d49fe8a0f6b5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -617,7 +617,7 @@ fail:
617 return ret; 617 return ret;
618} 618}
619 619
620static void btrfs_wait_nocow_write(struct btrfs_root *root) 620static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
621{ 621{
622 s64 writers; 622 s64 writers;
623 DEFINE_WAIT(wait); 623 DEFINE_WAIT(wait);
@@ -649,7 +649,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
649 649
650 atomic_inc(&root->will_be_snapshoted); 650 atomic_inc(&root->will_be_snapshoted);
651 smp_mb__after_atomic(); 651 smp_mb__after_atomic();
652 btrfs_wait_nocow_write(root); 652 btrfs_wait_for_no_snapshoting_writes(root);
653 653
654 ret = btrfs_start_delalloc_inodes(root, 0); 654 ret = btrfs_start_delalloc_inodes(root, 0);
655 if (ret) 655 if (ret)
@@ -717,35 +717,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
717 if (ret) 717 if (ret)
718 goto fail; 718 goto fail;
719 719
720 /*
721 * If orphan cleanup did remove any orphans, it means the tree was
722 * modified and therefore the commit root is not the same as the
723 * current root anymore. This is a problem, because send uses the
724 * commit root and therefore can see inode items that don't exist
725 * in the current root anymore, and for example make calls to
726 * btrfs_iget, which will do tree lookups based on the current root
727 * and not on the commit root. Those lookups will fail, returning a
728 * -ESTALE error, and making send fail with that error. So make sure
729 * a send does not see any orphans we have just removed, and that it
730 * will see the same inodes regardless of whether a transaction
731 * commit happened before it started (meaning that the commit root
732 * will be the same as the current root) or not.
733 */
734 if (readonly && pending_snapshot->snap->node !=
735 pending_snapshot->snap->commit_root) {
736 trans = btrfs_join_transaction(pending_snapshot->snap);
737 if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
738 ret = PTR_ERR(trans);
739 goto fail;
740 }
741 if (!IS_ERR(trans)) {
742 ret = btrfs_commit_transaction(trans,
743 pending_snapshot->snap);
744 if (ret)
745 goto fail;
746 }
747 }
748
749 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 720 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
750 if (IS_ERR(inode)) { 721 if (IS_ERR(inode)) {
751 ret = PTR_ERR(inode); 722 ret = PTR_ERR(inode);
@@ -761,7 +732,8 @@ fail:
761free: 732free:
762 kfree(pending_snapshot); 733 kfree(pending_snapshot);
763out: 734out:
764 atomic_dec(&root->will_be_snapshoted); 735 if (atomic_dec_and_test(&root->will_be_snapshoted))
736 wake_up_atomic_t(&root->will_be_snapshoted);
765 return ret; 737 return ret;
766} 738}
767 739
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ac734ec4cc20..534544e08f76 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 INIT_LIST_HEAD(&entry->work_list); 220 INIT_LIST_HEAD(&entry->work_list);
221 init_completion(&entry->completion); 221 init_completion(&entry->completion);
222 INIT_LIST_HEAD(&entry->log_list); 222 INIT_LIST_HEAD(&entry->log_list);
223 INIT_LIST_HEAD(&entry->trans_list);
223 224
224 trace_btrfs_ordered_extent_add(inode, entry); 225 trace_btrfs_ordered_extent_add(inode, entry);
225 226
@@ -431,19 +432,31 @@ out:
431 432
432/* Needs to either be called under a log transaction or the log_mutex */ 433/* Needs to either be called under a log transaction or the log_mutex */
433void btrfs_get_logged_extents(struct inode *inode, 434void btrfs_get_logged_extents(struct inode *inode,
434 struct list_head *logged_list) 435 struct list_head *logged_list,
436 const loff_t start,
437 const loff_t end)
435{ 438{
436 struct btrfs_ordered_inode_tree *tree; 439 struct btrfs_ordered_inode_tree *tree;
437 struct btrfs_ordered_extent *ordered; 440 struct btrfs_ordered_extent *ordered;
438 struct rb_node *n; 441 struct rb_node *n;
442 struct rb_node *prev;
439 443
440 tree = &BTRFS_I(inode)->ordered_tree; 444 tree = &BTRFS_I(inode)->ordered_tree;
441 spin_lock_irq(&tree->lock); 445 spin_lock_irq(&tree->lock);
442 for (n = rb_first(&tree->tree); n; n = rb_next(n)) { 446 n = __tree_search(&tree->tree, end, &prev);
447 if (!n)
448 n = prev;
449 for (; n; n = rb_prev(n)) {
443 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); 450 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
451 if (ordered->file_offset > end)
452 continue;
453 if (entry_end(ordered) <= start)
454 break;
444 if (!list_empty(&ordered->log_list)) 455 if (!list_empty(&ordered->log_list))
445 continue; 456 continue;
446 list_add_tail(&ordered->log_list, logged_list); 457 if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
458 continue;
459 list_add(&ordered->log_list, logged_list);
447 atomic_inc(&ordered->refs); 460 atomic_inc(&ordered->refs);
448 } 461 }
449 spin_unlock_irq(&tree->lock); 462 spin_unlock_irq(&tree->lock);
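
btrfs_get_logged_extents() now walks the ordered tree backwards from 'end' and stops at the first extent ending at or before 'start', so only extents overlapping the fsync range are collected. The overlap test being applied, written out as an illustrative helper (not in the patch):

	/* [file_offset, entry_end) overlaps [start, end] iff it begins at
	 * or before 'end' and finishes after 'start'. */
	static inline bool ordered_in_range(struct btrfs_ordered_extent *oe,
					    loff_t start, loff_t end)
	{
		return oe->file_offset <= end && entry_end(oe) > start;
	}
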
@@ -472,7 +485,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list,
472 spin_unlock_irq(&log->log_extents_lock[index]); 485 spin_unlock_irq(&log->log_extents_lock[index]);
473} 486}
474 487
475void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) 488void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
489 struct btrfs_root *log, u64 transid)
476{ 490{
477 struct btrfs_ordered_extent *ordered; 491 struct btrfs_ordered_extent *ordered;
478 int index = transid % 2; 492 int index = transid % 2;
@@ -497,7 +511,8 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
497 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, 511 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
498 &ordered->flags)); 512 &ordered->flags));
499 513
500 btrfs_put_ordered_extent(ordered); 514 if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
515 list_add_tail(&ordered->trans_list, &trans->ordered);
501 spin_lock_irq(&log->log_extents_lock[index]); 516 spin_lock_irq(&log->log_extents_lock[index]);
502 } 517 }
503 spin_unlock_irq(&log->log_extents_lock[index]); 518 spin_unlock_irq(&log->log_extents_lock[index]);
@@ -725,30 +740,10 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
725 /* start IO across the range first to instantiate any delalloc 740 /* start IO across the range first to instantiate any delalloc
726 * extents 741 * extents
727 */ 742 */
728 ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end); 743 ret = btrfs_fdatawrite_range(inode, start, orig_end);
729 if (ret) 744 if (ret)
730 return ret; 745 return ret;
731 /* 746
732 * So with compression we will find and lock a dirty page and clear the
733 * first one as dirty, setup an async extent, and immediately return
734 * with the entire range locked but with nobody actually marked with
735 * writeback. So we can't just filemap_write_and_wait_range() and
736 * expect it to work since it will just kick off a thread to do the
737 * actual work. So we need to call filemap_fdatawrite_range _again_
738 * since it will wait on the page lock, which won't be unlocked until
739 * after the pages have been marked as writeback and so we're good to go
740 * from there. We have to do this otherwise we'll miss the ordered
741 * extents and that results in badness. Please Josef, do not think you
742 * know better and pull this out at some point in the future, it is
743 * right and you are wrong.
744 */
745 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
746 &BTRFS_I(inode)->runtime_flags)) {
747 ret = filemap_fdatawrite_range(inode->i_mapping, start,
748 orig_end);
749 if (ret)
750 return ret;
751 }
752 ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); 747 ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
753 if (ret) 748 if (ret)
754 return ret; 749 return ret;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d81a274d621e..e96cd4ccd805 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -71,6 +71,8 @@ struct btrfs_ordered_sum {
71 ordered extent */ 71 ordered extent */
72#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */ 72#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */
73 73
74#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
75 * in the logging code. */
74struct btrfs_ordered_extent { 76struct btrfs_ordered_extent {
75 /* logical offset in the file */ 77 /* logical offset in the file */
76 u64 file_offset; 78 u64 file_offset;
@@ -121,6 +123,9 @@ struct btrfs_ordered_extent {
121 /* If we need to wait on this to be done */ 123 /* If we need to wait on this to be done */
122 struct list_head log_list; 124 struct list_head log_list;
123 125
126 /* If the transaction needs to wait on this ordered extent */
127 struct list_head trans_list;
128
124 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ 129 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
125 wait_queue_head_t wait; 130 wait_queue_head_t wait;
126 131
@@ -193,11 +198,14 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
193int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); 198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
194void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); 199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
195void btrfs_get_logged_extents(struct inode *inode, 200void btrfs_get_logged_extents(struct inode *inode,
196 struct list_head *logged_list); 201 struct list_head *logged_list,
202 const loff_t start,
203 const loff_t end);
197void btrfs_put_logged_extents(struct list_head *logged_list); 204void btrfs_put_logged_extents(struct list_head *logged_list);
198void btrfs_submit_logged_extents(struct list_head *logged_list, 205void btrfs_submit_logged_extents(struct list_head *logged_list,
199 struct btrfs_root *log); 206 struct btrfs_root *log);
200void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 207void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
208 struct btrfs_root *log, u64 transid);
201void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 209void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
202int __init ordered_data_init(void); 210int __init ordered_data_init(void);
203void ordered_data_exit(void); 211void ordered_data_exit(void);
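
The new trans_list and BTRFS_ORDERED_LOGGED bit let an fsync hand a waited-on ordered extent over to the transaction exactly once: the first waiter keeps its reference and parks the extent on trans->ordered, so the commit can complete them later instead of each waiter dropping its reference. In outline, from btrfs_wait_logged_extents() above:

	wait_event(ordered->wait,
		   test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags));

	/* Only the first waiter queues the extent on the transaction. */
	if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
		list_add_tail(&ordered->trans_list, &trans->ordered);
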
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 6a41631cb959..8ab2a17bbba8 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,9 +58,23 @@
58 */ 58 */
59#define RBIO_CACHE_READY_BIT 3 59#define RBIO_CACHE_READY_BIT 3
60 60
61/*
62 * The bbio and raid_map are managed by the caller, so we shouldn't
63 * free them here. Besides that, rbios with this flag must not be
64 * cached, because the raid_map is needed to check whether two rbios
65 * belong to the same stripe, and the caller has very likely already
66 * freed the raid_map.
67 */
68#define RBIO_HOLD_BBIO_MAP_BIT 4
61 69
62#define RBIO_CACHE_SIZE 1024 70#define RBIO_CACHE_SIZE 1024
63 71
72enum btrfs_rbio_ops {
73 BTRFS_RBIO_WRITE = 0,
74 BTRFS_RBIO_READ_REBUILD = 1,
75 BTRFS_RBIO_PARITY_SCRUB = 2,
76};
77
64struct btrfs_raid_bio { 78struct btrfs_raid_bio {
65 struct btrfs_fs_info *fs_info; 79 struct btrfs_fs_info *fs_info;
66 struct btrfs_bio *bbio; 80 struct btrfs_bio *bbio;
@@ -117,13 +131,16 @@ struct btrfs_raid_bio {
117 /* number of data stripes (no p/q) */ 131 /* number of data stripes (no p/q) */
118 int nr_data; 132 int nr_data;
119 133
134 int real_stripes;
135
136 int stripe_npages;
120 /* 137 /*
121 * set if we're doing a parity rebuild 138 * set if we're doing a parity rebuild
122 * for a read from higher up, which is handled 139 * for a read from higher up, which is handled
123 * differently from a parity rebuild as part of 140 * differently from a parity rebuild as part of
124 * rmw 141 * rmw
125 */ 142 */
126 int read_rebuild; 143 enum btrfs_rbio_ops operation;
127 144
128 /* first bad stripe */ 145 /* first bad stripe */
129 int faila; 146 int faila;
@@ -131,6 +148,7 @@ struct btrfs_raid_bio {
131 /* second bad stripe (for raid6 use) */ 148 /* second bad stripe (for raid6 use) */
132 int failb; 149 int failb;
133 150
151 int scrubp;
134 /* 152 /*
135 * number of pages needed to represent the full 153 * number of pages needed to represent the full
136 * stripe 154 * stripe
@@ -144,8 +162,13 @@ struct btrfs_raid_bio {
144 */ 162 */
145 int bio_list_bytes; 163 int bio_list_bytes;
146 164
165 int generic_bio_cnt;
166
147 atomic_t refs; 167 atomic_t refs;
148 168
169 atomic_t stripes_pending;
170
171 atomic_t error;
149 /* 172 /*
150 * these are two arrays of pointers. We allocate the 173 * these are two arrays of pointers. We allocate the
151 * rbio big enough to hold them both and setup their 174 * rbio big enough to hold them both and setup their
@@ -162,6 +185,11 @@ struct btrfs_raid_bio {
162 * here for faster lookup 185 * here for faster lookup
163 */ 186 */
164 struct page **bio_pages; 187 struct page **bio_pages;
188
189 /*
190 * bitmap to record which horizontal stripe has data
191 */
192 unsigned long *dbitmap;
165}; 193};
166 194
167static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 195static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
@@ -176,6 +204,10 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio);
176static void index_rbio_pages(struct btrfs_raid_bio *rbio); 204static void index_rbio_pages(struct btrfs_raid_bio *rbio);
177static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 205static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
178 206
207static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
208 int need_check);
209static void async_scrub_parity(struct btrfs_raid_bio *rbio);
210
179/* 211/*
180 * the stripe hash table is used for locking, and to collect 212 * the stripe hash table is used for locking, and to collect
181 * bios in hopes of making a full stripe 213 * bios in hopes of making a full stripe
@@ -324,6 +356,7 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
324{ 356{
325 bio_list_merge(&dest->bio_list, &victim->bio_list); 357 bio_list_merge(&dest->bio_list, &victim->bio_list);
326 dest->bio_list_bytes += victim->bio_list_bytes; 358 dest->bio_list_bytes += victim->bio_list_bytes;
359 dest->generic_bio_cnt += victim->generic_bio_cnt;
327 bio_list_init(&victim->bio_list); 360 bio_list_init(&victim->bio_list);
328} 361}
329 362
@@ -577,11 +610,20 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
577 cur->raid_map[0]) 610 cur->raid_map[0])
578 return 0; 611 return 0;
579 612
580 /* reads can't merge with writes */ 613 /* we can't merge with different operations */
581 if (last->read_rebuild != 614 if (last->operation != cur->operation)
582 cur->read_rebuild) { 615 return 0;
616 /*
617 * We need to read the full stripe from the drives, then check
618 * and repair the parity and write the new results.
619 *
620 * We're not allowed to add any new bios to the
621 * bio list here; anyone else that wants to
622 * change this stripe needs to do their own rmw.
623 */
624 if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
625 cur->operation == BTRFS_RBIO_PARITY_SCRUB)
583 return 0; 626 return 0;
584 }
585 627
586 return 1; 628 return 1;
587} 629}
@@ -601,7 +643,7 @@ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
601 */ 643 */
602static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) 644static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
603{ 645{
604 if (rbio->nr_data + 1 == rbio->bbio->num_stripes) 646 if (rbio->nr_data + 1 == rbio->real_stripes)
605 return NULL; 647 return NULL;
606 648
607 index += ((rbio->nr_data + 1) * rbio->stripe_len) >> 649 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
@@ -772,11 +814,14 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
772 spin_unlock(&rbio->bio_list_lock); 814 spin_unlock(&rbio->bio_list_lock);
773 spin_unlock_irqrestore(&h->lock, flags); 815 spin_unlock_irqrestore(&h->lock, flags);
774 816
775 if (next->read_rebuild) 817 if (next->operation == BTRFS_RBIO_READ_REBUILD)
776 async_read_rebuild(next); 818 async_read_rebuild(next);
777 else { 819 else if (next->operation == BTRFS_RBIO_WRITE) {
778 steal_rbio(rbio, next); 820 steal_rbio(rbio, next);
779 async_rmw_stripe(next); 821 async_rmw_stripe(next);
822 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
823 steal_rbio(rbio, next);
824 async_scrub_parity(next);
780 } 825 }
781 826
782 goto done_nolock; 827 goto done_nolock;
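The dispatch above keys on the new rbio->operation field, which replaces the old read_rebuild flag. A minimal sketch of the operation enum as it can be reconstructed from these call sites (the real definition lands earlier in this patch):

	enum btrfs_rbio_ops {
		BTRFS_RBIO_WRITE,		/* normal rmw write path */
		BTRFS_RBIO_READ_REBUILD,	/* rebuild data after a failed read */
		BTRFS_RBIO_PARITY_SCRUB,	/* check and repair the parity */
	};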
@@ -796,6 +841,21 @@ done_nolock:
796 remove_rbio_from_cache(rbio); 841 remove_rbio_from_cache(rbio);
797} 842}
798 843
844static inline void
845__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
846{
847 if (need) {
848 kfree(raid_map);
849 kfree(bbio);
850 }
851}
852
853static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
854{
855 __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
856 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
857}
858
799static void __free_raid_bio(struct btrfs_raid_bio *rbio) 859static void __free_raid_bio(struct btrfs_raid_bio *rbio)
800{ 860{
801 int i; 861 int i;
@@ -814,8 +874,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
814 rbio->stripe_pages[i] = NULL; 874 rbio->stripe_pages[i] = NULL;
815 } 875 }
816 } 876 }
817 kfree(rbio->raid_map); 877
818 kfree(rbio->bbio); 878 free_bbio_and_raid_map(rbio);
879
819 kfree(rbio); 880 kfree(rbio);
820} 881}
821 882
@@ -833,6 +894,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
833{ 894{
834 struct bio *cur = bio_list_get(&rbio->bio_list); 895 struct bio *cur = bio_list_get(&rbio->bio_list);
835 struct bio *next; 896 struct bio *next;
897
898 if (rbio->generic_bio_cnt)
899 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
900
836 free_raid_bio(rbio); 901 free_raid_bio(rbio);
837 902
838 while (cur) { 903 while (cur) {
@@ -858,13 +923,13 @@ static void raid_write_end_io(struct bio *bio, int err)
858 923
859 bio_put(bio); 924 bio_put(bio);
860 925
861 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 926 if (!atomic_dec_and_test(&rbio->stripes_pending))
862 return; 927 return;
863 928
864 err = 0; 929 err = 0;
865 930
866 /* OK, we have read all the stripes we need to. */ 931 /* OK, we have read all the stripes we need to. */
867 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 932 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
868 err = -EIO; 933 err = -EIO;
869 934
870 rbio_orig_end_io(rbio, err, 0); 935 rbio_orig_end_io(rbio, err, 0);
@@ -925,16 +990,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
925{ 990{
926 struct btrfs_raid_bio *rbio; 991 struct btrfs_raid_bio *rbio;
927 int nr_data = 0; 992 int nr_data = 0;
928 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); 993 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
994 int num_pages = rbio_nr_pages(stripe_len, real_stripes);
995 int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
929 void *p; 996 void *p;
930 997
931 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, 998 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
999 DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
932 GFP_NOFS); 1000 GFP_NOFS);
933 if (!rbio) { 1001 if (!rbio)
934 kfree(raid_map);
935 kfree(bbio);
936 return ERR_PTR(-ENOMEM); 1002 return ERR_PTR(-ENOMEM);
937 }
938 1003
939 bio_list_init(&rbio->bio_list); 1004 bio_list_init(&rbio->bio_list);
940 INIT_LIST_HEAD(&rbio->plug_list); 1005 INIT_LIST_HEAD(&rbio->plug_list);
@@ -946,9 +1011,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
946 rbio->fs_info = root->fs_info; 1011 rbio->fs_info = root->fs_info;
947 rbio->stripe_len = stripe_len; 1012 rbio->stripe_len = stripe_len;
948 rbio->nr_pages = num_pages; 1013 rbio->nr_pages = num_pages;
1014 rbio->real_stripes = real_stripes;
1015 rbio->stripe_npages = stripe_npages;
949 rbio->faila = -1; 1016 rbio->faila = -1;
950 rbio->failb = -1; 1017 rbio->failb = -1;
951 atomic_set(&rbio->refs, 1); 1018 atomic_set(&rbio->refs, 1);
1019 atomic_set(&rbio->error, 0);
1020 atomic_set(&rbio->stripes_pending, 0);
952 1021
953 /* 1022 /*
954 * the stripe_pages and bio_pages array point to the extra 1023 * the stripe_pages and bio_pages array point to the extra
@@ -957,11 +1026,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
957 p = rbio + 1; 1026 p = rbio + 1;
958 rbio->stripe_pages = p; 1027 rbio->stripe_pages = p;
959 rbio->bio_pages = p + sizeof(struct page *) * num_pages; 1028 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
1029 rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
960 1030
961 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) 1031 if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
962 nr_data = bbio->num_stripes - 2; 1032 nr_data = real_stripes - 2;
963 else 1033 else
964 nr_data = bbio->num_stripes - 1; 1034 nr_data = real_stripes - 1;
965 1035
966 rbio->nr_data = nr_data; 1036 rbio->nr_data = nr_data;
967 return rbio; 1037 return rbio;
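alloc_rbio() now carves three trailing regions out of that single kzalloc. A sketch of the layout, following the pointer arithmetic in the hunk above:

	/*
	 * rbio + 1 -> stripe_pages[num_pages]  (struct page *)
	 *          -> bio_pages[num_pages]     (struct page *)
	 *          -> dbitmap                  (stripe_npages bits, one per
	 *                                       horizontal sector of the stripe)
	 */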
@@ -1073,7 +1143,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1073static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 1143static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1074{ 1144{
1075 if (rbio->faila >= 0 || rbio->failb >= 0) { 1145 if (rbio->faila >= 0 || rbio->failb >= 0) {
1076 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); 1146 BUG_ON(rbio->faila == rbio->real_stripes - 1);
1077 __raid56_parity_recover(rbio); 1147 __raid56_parity_recover(rbio);
1078 } else { 1148 } else {
1079 finish_rmw(rbio); 1149 finish_rmw(rbio);
@@ -1134,7 +1204,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1134static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 1204static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1135{ 1205{
1136 struct btrfs_bio *bbio = rbio->bbio; 1206 struct btrfs_bio *bbio = rbio->bbio;
1137 void *pointers[bbio->num_stripes]; 1207 void *pointers[rbio->real_stripes];
1138 int stripe_len = rbio->stripe_len; 1208 int stripe_len = rbio->stripe_len;
1139 int nr_data = rbio->nr_data; 1209 int nr_data = rbio->nr_data;
1140 int stripe; 1210 int stripe;
@@ -1148,11 +1218,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1148 1218
1149 bio_list_init(&bio_list); 1219 bio_list_init(&bio_list);
1150 1220
1151 if (bbio->num_stripes - rbio->nr_data == 1) { 1221 if (rbio->real_stripes - rbio->nr_data == 1) {
1152 p_stripe = bbio->num_stripes - 1; 1222 p_stripe = rbio->real_stripes - 1;
1153 } else if (bbio->num_stripes - rbio->nr_data == 2) { 1223 } else if (rbio->real_stripes - rbio->nr_data == 2) {
1154 p_stripe = bbio->num_stripes - 2; 1224 p_stripe = rbio->real_stripes - 2;
1155 q_stripe = bbio->num_stripes - 1; 1225 q_stripe = rbio->real_stripes - 1;
1156 } else { 1226 } else {
1157 BUG(); 1227 BUG();
1158 } 1228 }
@@ -1169,7 +1239,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1169 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1239 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1170 spin_unlock_irq(&rbio->bio_list_lock); 1240 spin_unlock_irq(&rbio->bio_list_lock);
1171 1241
1172 atomic_set(&rbio->bbio->error, 0); 1242 atomic_set(&rbio->error, 0);
1173 1243
1174 /* 1244 /*
1175 * now that we've set rmw_locked, run through the 1245 * now that we've set rmw_locked, run through the
@@ -1209,7 +1279,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1209 SetPageUptodate(p); 1279 SetPageUptodate(p);
1210 pointers[stripe++] = kmap(p); 1280 pointers[stripe++] = kmap(p);
1211 1281
1212 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, 1282 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1213 pointers); 1283 pointers);
1214 } else { 1284 } else {
1215 /* raid5 */ 1285 /* raid5 */
@@ -1218,7 +1288,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1218 } 1288 }
1219 1289
1220 1290
1221 for (stripe = 0; stripe < bbio->num_stripes; stripe++) 1291 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
1222 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 1292 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1223 } 1293 }
1224 1294
@@ -1227,7 +1297,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1227 * higher layers (the bio_list in our rbio) and our p/q. Ignore 1297 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1228 * everything else. 1298 * everything else.
1229 */ 1299 */
1230 for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 1300 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1231 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 1301 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1232 struct page *page; 1302 struct page *page;
1233 if (stripe < rbio->nr_data) { 1303 if (stripe < rbio->nr_data) {
@@ -1245,8 +1315,34 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1245 } 1315 }
1246 } 1316 }
1247 1317
1248 atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); 1318 if (likely(!bbio->num_tgtdevs))
1249 BUG_ON(atomic_read(&bbio->stripes_pending) == 0); 1319 goto write_data;
1320
1321 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1322 if (!bbio->tgtdev_map[stripe])
1323 continue;
1324
1325 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1326 struct page *page;
1327 if (stripe < rbio->nr_data) {
1328 page = page_in_rbio(rbio, stripe, pagenr, 1);
1329 if (!page)
1330 continue;
1331 } else {
1332 page = rbio_stripe_page(rbio, stripe, pagenr);
1333 }
1334
1335 ret = rbio_add_io_page(rbio, &bio_list, page,
1336 rbio->bbio->tgtdev_map[stripe],
1337 pagenr, rbio->stripe_len);
1338 if (ret)
1339 goto cleanup;
1340 }
1341 }
1342
1343write_data:
1344 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1345 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1250 1346
1251 while (1) { 1347 while (1) {
1252 bio = bio_list_pop(&bio_list); 1348 bio = bio_list_pop(&bio_list);
@@ -1283,7 +1379,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1283 stripe = &rbio->bbio->stripes[i]; 1379 stripe = &rbio->bbio->stripes[i];
1284 stripe_start = stripe->physical; 1380 stripe_start = stripe->physical;
1285 if (physical >= stripe_start && 1381 if (physical >= stripe_start &&
1286 physical < stripe_start + rbio->stripe_len) { 1382 physical < stripe_start + rbio->stripe_len &&
1383 bio->bi_bdev == stripe->dev->bdev) {
1287 return i; 1384 return i;
1288 } 1385 }
1289 } 1386 }
@@ -1331,11 +1428,11 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1331 if (rbio->faila == -1) { 1428 if (rbio->faila == -1) {
1332 /* first failure on this rbio */ 1429 /* first failure on this rbio */
1333 rbio->faila = failed; 1430 rbio->faila = failed;
1334 atomic_inc(&rbio->bbio->error); 1431 atomic_inc(&rbio->error);
1335 } else if (rbio->failb == -1) { 1432 } else if (rbio->failb == -1) {
1336 /* second failure on this rbio */ 1433 /* second failure on this rbio */
1337 rbio->failb = failed; 1434 rbio->failb = failed;
1338 atomic_inc(&rbio->bbio->error); 1435 atomic_inc(&rbio->error);
1339 } else { 1436 } else {
1340 ret = -EIO; 1437 ret = -EIO;
1341 } 1438 }
@@ -1394,11 +1491,11 @@ static void raid_rmw_end_io(struct bio *bio, int err)
1394 1491
1395 bio_put(bio); 1492 bio_put(bio);
1396 1493
1397 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 1494 if (!atomic_dec_and_test(&rbio->stripes_pending))
1398 return; 1495 return;
1399 1496
1400 err = 0; 1497 err = 0;
1401 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 1498 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1402 goto cleanup; 1499 goto cleanup;
1403 1500
1404 /* 1501 /*
@@ -1439,7 +1536,6 @@ static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1439static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1536static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1440{ 1537{
1441 int bios_to_read = 0; 1538 int bios_to_read = 0;
1442 struct btrfs_bio *bbio = rbio->bbio;
1443 struct bio_list bio_list; 1539 struct bio_list bio_list;
1444 int ret; 1540 int ret;
1445 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 1541 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1455,7 +1551,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1455 1551
1456 index_rbio_pages(rbio); 1552 index_rbio_pages(rbio);
1457 1553
1458 atomic_set(&rbio->bbio->error, 0); 1554 atomic_set(&rbio->error, 0);
1459 /* 1555 /*
1460 * build a list of bios to read all the missing parts of this 1556 * build a list of bios to read all the missing parts of this
1461 * stripe 1557 * stripe
@@ -1503,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1503 * the bbio may be freed once we submit the last bio. Make sure 1599 * the bbio may be freed once we submit the last bio. Make sure
1504 * not to touch it after that 1600 * not to touch it after that
1505 */ 1601 */
1506 atomic_set(&bbio->stripes_pending, bios_to_read); 1602 atomic_set(&rbio->stripes_pending, bios_to_read);
1507 while (1) { 1603 while (1) {
1508 bio = bio_list_pop(&bio_list); 1604 bio = bio_list_pop(&bio_list);
1509 if (!bio) 1605 if (!bio)
@@ -1686,19 +1782,30 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1686 struct btrfs_raid_bio *rbio; 1782 struct btrfs_raid_bio *rbio;
1687 struct btrfs_plug_cb *plug = NULL; 1783 struct btrfs_plug_cb *plug = NULL;
1688 struct blk_plug_cb *cb; 1784 struct blk_plug_cb *cb;
1785 int ret;
1689 1786
1690 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 1787 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1691 if (IS_ERR(rbio)) 1788 if (IS_ERR(rbio)) {
1789 __free_bbio_and_raid_map(bbio, raid_map, 1);
1692 return PTR_ERR(rbio); 1790 return PTR_ERR(rbio);
1791 }
1693 bio_list_add(&rbio->bio_list, bio); 1792 bio_list_add(&rbio->bio_list, bio);
1694 rbio->bio_list_bytes = bio->bi_iter.bi_size; 1793 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1794 rbio->operation = BTRFS_RBIO_WRITE;
1795
1796 btrfs_bio_counter_inc_noblocked(root->fs_info);
1797 rbio->generic_bio_cnt = 1;
1695 1798
1696 /* 1799 /*
1697 * don't plug on full rbios, just get them out the door 1800 * don't plug on full rbios, just get them out the door
1698 * as quickly as we can 1801 * as quickly as we can
1699 */ 1802 */
1700 if (rbio_is_full(rbio)) 1803 if (rbio_is_full(rbio)) {
1701 return full_stripe_write(rbio); 1804 ret = full_stripe_write(rbio);
1805 if (ret)
1806 btrfs_bio_counter_dec(root->fs_info);
1807 return ret;
1808 }
1702 1809
1703 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, 1810 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1704 sizeof(*plug)); 1811 sizeof(*plug));
@@ -1709,10 +1816,13 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1709 INIT_LIST_HEAD(&plug->rbio_list); 1816 INIT_LIST_HEAD(&plug->rbio_list);
1710 } 1817 }
1711 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1818 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1819 ret = 0;
1712 } else { 1820 } else {
1713 return __raid56_parity_write(rbio); 1821 ret = __raid56_parity_write(rbio);
1822 if (ret)
1823 btrfs_bio_counter_dec(root->fs_info);
1714 } 1824 }
1715 return 0; 1825 return ret;
1716} 1826}
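A write rbio now pins the fs-wide bio counter while it is in flight. The pairing, assembled from this hunk and from rbio_orig_end_io() above (a summary sketch, not new code):

	btrfs_bio_counter_inc_noblocked(root->fs_info);	/* at submission */
	rbio->generic_bio_cnt = 1;
	/* on completion, rbio_orig_end_io() releases it: */
	btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
	/* the error paths above drop it immediately instead: */
	btrfs_bio_counter_dec(root->fs_info);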
1717 1827
1718/* 1828/*
@@ -1730,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1730 int err; 1840 int err;
1731 int i; 1841 int i;
1732 1842
1733 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), 1843 pointers = kzalloc(rbio->real_stripes * sizeof(void *),
1734 GFP_NOFS); 1844 GFP_NOFS);
1735 if (!pointers) { 1845 if (!pointers) {
1736 err = -ENOMEM; 1846 err = -ENOMEM;
@@ -1740,7 +1850,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1740 faila = rbio->faila; 1850 faila = rbio->faila;
1741 failb = rbio->failb; 1851 failb = rbio->failb;
1742 1852
1743 if (rbio->read_rebuild) { 1853 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1744 spin_lock_irq(&rbio->bio_list_lock); 1854 spin_lock_irq(&rbio->bio_list_lock);
1745 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1855 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1746 spin_unlock_irq(&rbio->bio_list_lock); 1856 spin_unlock_irq(&rbio->bio_list_lock);
@@ -1749,15 +1859,23 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1749 index_rbio_pages(rbio); 1859 index_rbio_pages(rbio);
1750 1860
1751 for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1861 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1862 /*
 1863 * For parity scrub, the dbitmap marks the horizontal stripes
 1864 * that contain data; skip all the others.
1865 */
1866 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1867 !test_bit(pagenr, rbio->dbitmap))
1868 continue;
1869
1752 /* setup our array of pointers with pages 1870 /* setup our array of pointers with pages
1753 * from each stripe 1871 * from each stripe
1754 */ 1872 */
1755 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 1873 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1756 /* 1874 /*
1757 * if we're rebuilding a read, we have to use 1875 * if we're rebuilding a read, we have to use
1758 * pages from the bio list 1876 * pages from the bio list
1759 */ 1877 */
1760 if (rbio->read_rebuild && 1878 if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1761 (stripe == faila || stripe == failb)) { 1879 (stripe == faila || stripe == failb)) {
1762 page = page_in_rbio(rbio, stripe, pagenr, 0); 1880 page = page_in_rbio(rbio, stripe, pagenr, 0);
1763 } else { 1881 } else {
@@ -1767,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1767 } 1885 }
1768 1886
1769 /* all raid6 handling here */ 1887 /* all raid6 handling here */
1770 if (rbio->raid_map[rbio->bbio->num_stripes - 1] == 1888 if (rbio->raid_map[rbio->real_stripes - 1] ==
1771 RAID6_Q_STRIPE) { 1889 RAID6_Q_STRIPE) {
1772 1890
1773 /* 1891 /*
@@ -1817,10 +1935,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1817 } 1935 }
1818 1936
1819 if (rbio->raid_map[failb] == RAID5_P_STRIPE) { 1937 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1820 raid6_datap_recov(rbio->bbio->num_stripes, 1938 raid6_datap_recov(rbio->real_stripes,
1821 PAGE_SIZE, faila, pointers); 1939 PAGE_SIZE, faila, pointers);
1822 } else { 1940 } else {
1823 raid6_2data_recov(rbio->bbio->num_stripes, 1941 raid6_2data_recov(rbio->real_stripes,
1824 PAGE_SIZE, faila, failb, 1942 PAGE_SIZE, faila, failb,
1825 pointers); 1943 pointers);
1826 } 1944 }
@@ -1850,7 +1968,7 @@ pstripe:
1850 * know they can be trusted. If this was a read reconstruction, 1968 * know they can be trusted. If this was a read reconstruction,
1851 * other endio functions will fiddle the uptodate bits 1969 * other endio functions will fiddle the uptodate bits
1852 */ 1970 */
1853 if (!rbio->read_rebuild) { 1971 if (rbio->operation == BTRFS_RBIO_WRITE) {
1854 for (i = 0; i < nr_pages; i++) { 1972 for (i = 0; i < nr_pages; i++) {
1855 if (faila != -1) { 1973 if (faila != -1) {
1856 page = rbio_stripe_page(rbio, faila, i); 1974 page = rbio_stripe_page(rbio, faila, i);
@@ -1862,12 +1980,12 @@ pstripe:
1862 } 1980 }
1863 } 1981 }
1864 } 1982 }
1865 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 1983 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1866 /* 1984 /*
1867 * if we're rebuilding a read, we have to use 1985 * if we're rebuilding a read, we have to use
1868 * pages from the bio list 1986 * pages from the bio list
1869 */ 1987 */
1870 if (rbio->read_rebuild && 1988 if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1871 (stripe == faila || stripe == failb)) { 1989 (stripe == faila || stripe == failb)) {
1872 page = page_in_rbio(rbio, stripe, pagenr, 0); 1990 page = page_in_rbio(rbio, stripe, pagenr, 0);
1873 } else { 1991 } else {
@@ -1882,9 +2000,9 @@ cleanup:
1882 kfree(pointers); 2000 kfree(pointers);
1883 2001
1884cleanup_io: 2002cleanup_io:
1885 2003 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1886 if (rbio->read_rebuild) { 2004 if (err == 0 &&
1887 if (err == 0) 2005 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
1888 cache_rbio_pages(rbio); 2006 cache_rbio_pages(rbio);
1889 else 2007 else
1890 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2008 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -1893,7 +2011,13 @@ cleanup_io:
1893 } else if (err == 0) { 2011 } else if (err == 0) {
1894 rbio->faila = -1; 2012 rbio->faila = -1;
1895 rbio->failb = -1; 2013 rbio->failb = -1;
1896 finish_rmw(rbio); 2014
2015 if (rbio->operation == BTRFS_RBIO_WRITE)
2016 finish_rmw(rbio);
2017 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2018 finish_parity_scrub(rbio, 0);
2019 else
2020 BUG();
1897 } else { 2021 } else {
1898 rbio_orig_end_io(rbio, err, 0); 2022 rbio_orig_end_io(rbio, err, 0);
1899 } 2023 }
@@ -1917,10 +2041,10 @@ static void raid_recover_end_io(struct bio *bio, int err)
1917 set_bio_pages_uptodate(bio); 2041 set_bio_pages_uptodate(bio);
1918 bio_put(bio); 2042 bio_put(bio);
1919 2043
1920 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 2044 if (!atomic_dec_and_test(&rbio->stripes_pending))
1921 return; 2045 return;
1922 2046
1923 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 2047 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1924 rbio_orig_end_io(rbio, -EIO, 0); 2048 rbio_orig_end_io(rbio, -EIO, 0);
1925 else 2049 else
1926 __raid_recover_end_io(rbio); 2050 __raid_recover_end_io(rbio);
@@ -1937,7 +2061,6 @@ static void raid_recover_end_io(struct bio *bio, int err)
1937static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 2061static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1938{ 2062{
1939 int bios_to_read = 0; 2063 int bios_to_read = 0;
1940 struct btrfs_bio *bbio = rbio->bbio;
1941 struct bio_list bio_list; 2064 struct bio_list bio_list;
1942 int ret; 2065 int ret;
1943 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 2066 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1951,16 +2074,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1951 if (ret) 2074 if (ret)
1952 goto cleanup; 2075 goto cleanup;
1953 2076
1954 atomic_set(&rbio->bbio->error, 0); 2077 atomic_set(&rbio->error, 0);
1955 2078
1956 /* 2079 /*
1957 * read everything that hasn't failed. Thanks to the 2080 * read everything that hasn't failed. Thanks to the
1958 * stripe cache, it is possible that some or all of these 2081 * stripe cache, it is possible that some or all of these
1959 * pages are going to be uptodate. 2082 * pages are going to be uptodate.
1960 */ 2083 */
1961 for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 2084 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1962 if (rbio->faila == stripe || rbio->failb == stripe) { 2085 if (rbio->faila == stripe || rbio->failb == stripe) {
1963 atomic_inc(&rbio->bbio->error); 2086 atomic_inc(&rbio->error);
1964 continue; 2087 continue;
1965 } 2088 }
1966 2089
@@ -1990,7 +2113,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1990 * were up to date, or we might have no bios to read because 2113 * were up to date, or we might have no bios to read because
1991 * the devices were gone. 2114 * the devices were gone.
1992 */ 2115 */
1993 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { 2116 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
1994 __raid_recover_end_io(rbio); 2117 __raid_recover_end_io(rbio);
1995 goto out; 2118 goto out;
1996 } else { 2119 } else {
@@ -2002,7 +2125,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2002 * the bbio may be freed once we submit the last bio. Make sure 2125 * the bbio may be freed once we submit the last bio. Make sure
2003 * not to touch it after that 2126 * not to touch it after that
2004 */ 2127 */
2005 atomic_set(&bbio->stripes_pending, bios_to_read); 2128 atomic_set(&rbio->stripes_pending, bios_to_read);
2006 while (1) { 2129 while (1) {
2007 bio = bio_list_pop(&bio_list); 2130 bio = bio_list_pop(&bio_list);
2008 if (!bio) 2131 if (!bio)
@@ -2021,7 +2144,7 @@ out:
2021 return 0; 2144 return 0;
2022 2145
2023cleanup: 2146cleanup:
2024 if (rbio->read_rebuild) 2147 if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
2025 rbio_orig_end_io(rbio, -EIO, 0); 2148 rbio_orig_end_io(rbio, -EIO, 0);
2026 return -EIO; 2149 return -EIO;
2027} 2150}
@@ -2034,34 +2157,42 @@ cleanup:
2034 */ 2157 */
2035int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 2158int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2036 struct btrfs_bio *bbio, u64 *raid_map, 2159 struct btrfs_bio *bbio, u64 *raid_map,
2037 u64 stripe_len, int mirror_num) 2160 u64 stripe_len, int mirror_num, int generic_io)
2038{ 2161{
2039 struct btrfs_raid_bio *rbio; 2162 struct btrfs_raid_bio *rbio;
2040 int ret; 2163 int ret;
2041 2164
2042 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 2165 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2043 if (IS_ERR(rbio)) 2166 if (IS_ERR(rbio)) {
2167 __free_bbio_and_raid_map(bbio, raid_map, generic_io);
2044 return PTR_ERR(rbio); 2168 return PTR_ERR(rbio);
2169 }
2045 2170
2046 rbio->read_rebuild = 1; 2171 rbio->operation = BTRFS_RBIO_READ_REBUILD;
2047 bio_list_add(&rbio->bio_list, bio); 2172 bio_list_add(&rbio->bio_list, bio);
2048 rbio->bio_list_bytes = bio->bi_iter.bi_size; 2173 rbio->bio_list_bytes = bio->bi_iter.bi_size;
2049 2174
2050 rbio->faila = find_logical_bio_stripe(rbio, bio); 2175 rbio->faila = find_logical_bio_stripe(rbio, bio);
2051 if (rbio->faila == -1) { 2176 if (rbio->faila == -1) {
2052 BUG(); 2177 BUG();
2053 kfree(raid_map); 2178 __free_bbio_and_raid_map(bbio, raid_map, generic_io);
2054 kfree(bbio);
2055 kfree(rbio); 2179 kfree(rbio);
2056 return -EIO; 2180 return -EIO;
2057 } 2181 }
2058 2182
2183 if (generic_io) {
2184 btrfs_bio_counter_inc_noblocked(root->fs_info);
2185 rbio->generic_bio_cnt = 1;
2186 } else {
2187 set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
2188 }
2189
2059 /* 2190 /*
2060 * reconstruct from the q stripe if they are 2191 * reconstruct from the q stripe if they are
2061 * asking for mirror 3 2192 * asking for mirror 3
2062 */ 2193 */
2063 if (mirror_num == 3) 2194 if (mirror_num == 3)
2064 rbio->failb = bbio->num_stripes - 2; 2195 rbio->failb = rbio->real_stripes - 2;
2065 2196
2066 ret = lock_stripe_add(rbio); 2197 ret = lock_stripe_add(rbio);
2067 2198
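The new generic_io argument splits the two recovery callers. The contract, inferred from this hunk and from the scrub call site later in the series:

	/*
	 * generic_io == 1: ordinary read-repair. The rbio owns bbio and
	 *                  raid_map (freed via free_bbio_and_raid_map())
	 *                  and holds one reference on the bio counter.
	 * generic_io == 0: scrub's synchronous rebuild. The caller keeps
	 *                  ownership, so RBIO_HOLD_BBIO_MAP_BIT is set and
	 *                  the rebuilt pages are not cached.
	 */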
@@ -2098,3 +2229,483 @@ static void read_rebuild_work(struct btrfs_work *work)
2098 rbio = container_of(work, struct btrfs_raid_bio, work); 2229 rbio = container_of(work, struct btrfs_raid_bio, work);
2099 __raid56_parity_recover(rbio); 2230 __raid56_parity_recover(rbio);
2100} 2231}
2232
2233/*
2234 * The following code is used to scrub/replace the parity stripe
2235 *
 2236 * Note: We must make sure that all the pages added to the scrub/replace
 2237 * raid bio are correct and do not change while the scrub/replace runs,
 2238 * i.e. that they only hold metadata or file data protected by a checksum.
2239 */
2240
2241struct btrfs_raid_bio *
2242raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
2243 struct btrfs_bio *bbio, u64 *raid_map,
2244 u64 stripe_len, struct btrfs_device *scrub_dev,
2245 unsigned long *dbitmap, int stripe_nsectors)
2246{
2247 struct btrfs_raid_bio *rbio;
2248 int i;
2249
2250 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2251 if (IS_ERR(rbio))
2252 return NULL;
2253 bio_list_add(&rbio->bio_list, bio);
2254 /*
 2255 * This is a special bio: it only carries the completion handler,
 2256 * so that the scrub rbio can be handled like the other rbio types
2257 */
2258 ASSERT(!bio->bi_iter.bi_size);
2259 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2260
2261 for (i = 0; i < rbio->real_stripes; i++) {
2262 if (bbio->stripes[i].dev == scrub_dev) {
2263 rbio->scrubp = i;
2264 break;
2265 }
2266 }
2267
 2268 /* For now we only support a sectorsize equal to the page size */
2269 ASSERT(root->sectorsize == PAGE_SIZE);
2270 ASSERT(rbio->stripe_npages == stripe_nsectors);
2271 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2272
2273 return rbio;
2274}
2275
2276void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
2277 struct page *page, u64 logical)
2278{
2279 int stripe_offset;
2280 int index;
2281
2282 ASSERT(logical >= rbio->raid_map[0]);
2283 ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
2284 rbio->stripe_len * rbio->nr_data);
2285 stripe_offset = (int)(logical - rbio->raid_map[0]);
2286 index = stripe_offset >> PAGE_CACHE_SHIFT;
2287 rbio->bio_pages[index] = page;
2288}
2289
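raid56_parity_add_scrub_pages() maps a logical address straight to a slot in the flat bio_pages array. A worked example under assumed geometry (4K pages, 64K stripe_len, two data stripes):

	/*
	 * logical = raid_map[0] + 68K
	 *   stripe_offset = 68K
	 *   index         = 68K >> PAGE_CACHE_SHIFT = 17
	 * so the page lands in bio_pages[17]: page 1 of the second data
	 * stripe, since pages 0-15 cover the first 64K data stripe.
	 */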
2290/*
 2291 * We only scrub the parity of horizontal stripes for which we have
 2292 * correct data, so we needn't allocate pages for every stripe.
2293 */
2294static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2295{
2296 int i;
2297 int bit;
2298 int index;
2299 struct page *page;
2300
2301 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2302 for (i = 0; i < rbio->real_stripes; i++) {
2303 index = i * rbio->stripe_npages + bit;
2304 if (rbio->stripe_pages[index])
2305 continue;
2306
2307 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2308 if (!page)
2309 return -ENOMEM;
2310 rbio->stripe_pages[index] = page;
2311 ClearPageUptodate(page);
2312 }
2313 }
2314 return 0;
2315}
2316
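alloc_rbio_essential_pages() only fills the stripe_pages slots selected by the dbitmap. The flat index it uses, with assumed numbers:

	/*
	 * index = i * stripe_npages + bit
	 *
	 * e.g. stripe_npages = 16 and only bit 3 set in dbitmap:
	 *   stripe 0 -> stripe_pages[3]
	 *   stripe 1 -> stripe_pages[19]
	 *   stripe 2 -> stripe_pages[35]
	 * One page per stripe; the other fifteen horizontal sectors are
	 * never read for this scrub.
	 */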
2317/*
 2318 * end io function used by finish_parity_scrub. When we finally
 2319 * get here, we've written the parity for a full stripe
2320 */
2321static void raid_write_parity_end_io(struct bio *bio, int err)
2322{
2323 struct btrfs_raid_bio *rbio = bio->bi_private;
2324
2325 if (err)
2326 fail_bio_stripe(rbio, bio);
2327
2328 bio_put(bio);
2329
2330 if (!atomic_dec_and_test(&rbio->stripes_pending))
2331 return;
2332
2333 err = 0;
2334
2335 if (atomic_read(&rbio->error))
2336 err = -EIO;
2337
2338 rbio_orig_end_io(rbio, err, 0);
2339}
2340
2341static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2342 int need_check)
2343{
2344 struct btrfs_bio *bbio = rbio->bbio;
2345 void *pointers[rbio->real_stripes];
2346 DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
2347 int nr_data = rbio->nr_data;
2348 int stripe;
2349 int pagenr;
2350 int p_stripe = -1;
2351 int q_stripe = -1;
2352 struct page *p_page = NULL;
2353 struct page *q_page = NULL;
2354 struct bio_list bio_list;
2355 struct bio *bio;
2356 int is_replace = 0;
2357 int ret;
2358
2359 bio_list_init(&bio_list);
2360
2361 if (rbio->real_stripes - rbio->nr_data == 1) {
2362 p_stripe = rbio->real_stripes - 1;
2363 } else if (rbio->real_stripes - rbio->nr_data == 2) {
2364 p_stripe = rbio->real_stripes - 2;
2365 q_stripe = rbio->real_stripes - 1;
2366 } else {
2367 BUG();
2368 }
2369
2370 if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
2371 is_replace = 1;
2372 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2373 }
2374
2375 /*
 2376 * The higher layers (the scrubber) are unlikely to use
 2377 * this area of the disk again soon, so don't
 2378 * cache it.
2379 */
2380 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2381
2382 if (!need_check)
2383 goto writeback;
2384
2385 p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2386 if (!p_page)
2387 goto cleanup;
2388 SetPageUptodate(p_page);
2389
2390 if (q_stripe != -1) {
2391 q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2392 if (!q_page) {
2393 __free_page(p_page);
2394 goto cleanup;
2395 }
2396 SetPageUptodate(q_page);
2397 }
2398
2399 atomic_set(&rbio->error, 0);
2400
2401 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2402 struct page *p;
2403 void *parity;
2404 /* first collect one page from each data stripe */
2405 for (stripe = 0; stripe < nr_data; stripe++) {
2406 p = page_in_rbio(rbio, stripe, pagenr, 0);
2407 pointers[stripe] = kmap(p);
2408 }
2409
2410 /* then add the parity stripe */
2411 pointers[stripe++] = kmap(p_page);
2412
2413 if (q_stripe != -1) {
2414
2415 /*
2416 * raid6, add the qstripe and call the
2417 * library function to fill in our p/q
2418 */
2419 pointers[stripe++] = kmap(q_page);
2420
2421 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
2422 pointers);
2423 } else {
2424 /* raid5 */
2425 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
2426 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
2427 }
2428
 2429 /* Check the parity being scrubbed and repair it */
2430 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2431 parity = kmap(p);
2432 if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
2433 memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
2434 else
 2435 /* Parity is right, no writeback needed */
2436 bitmap_clear(rbio->dbitmap, pagenr, 1);
2437 kunmap(p);
2438
2439 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
2440 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
2441 }
2442
2443 __free_page(p_page);
2444 if (q_page)
2445 __free_page(q_page);
2446
2447writeback:
2448 /*
 2449 * time to start writing. Make bios only for the parity pages that
 2450 * changed on the stripe being scrubbed and, in the replace case,
 2451 * for the replace target device. Everything else is untouched.
2452 */
2453 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2454 struct page *page;
2455
2456 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2457 ret = rbio_add_io_page(rbio, &bio_list,
2458 page, rbio->scrubp, pagenr, rbio->stripe_len);
2459 if (ret)
2460 goto cleanup;
2461 }
2462
2463 if (!is_replace)
2464 goto submit_write;
2465
2466 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2467 struct page *page;
2468
2469 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2470 ret = rbio_add_io_page(rbio, &bio_list, page,
2471 bbio->tgtdev_map[rbio->scrubp],
2472 pagenr, rbio->stripe_len);
2473 if (ret)
2474 goto cleanup;
2475 }
2476
2477submit_write:
2478 nr_data = bio_list_size(&bio_list);
2479 if (!nr_data) {
2480 /* Every parity is right */
2481 rbio_orig_end_io(rbio, 0, 0);
2482 return;
2483 }
2484
2485 atomic_set(&rbio->stripes_pending, nr_data);
2486
2487 while (1) {
2488 bio = bio_list_pop(&bio_list);
2489 if (!bio)
2490 break;
2491
2492 bio->bi_private = rbio;
2493 bio->bi_end_io = raid_write_parity_end_io;
2494 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2495 submit_bio(WRITE, bio);
2496 }
2497 return;
2498
2499cleanup:
2500 rbio_orig_end_io(rbio, -EIO, 0);
2501}
2502
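The heart of finish_parity_scrub() is recompute, compare, rewrite only on mismatch. A self-contained userspace sketch of the RAID5 case (xor parity only; RAID6 feeds the q stripe through raid6_call.gen_syndrome() instead):

	#include <string.h>

	/* Returns 1 if the parity had to be repaired, 0 if it was right. */
	static int check_and_repair_parity(unsigned char **data, int nr_data,
					   unsigned char *parity, size_t len)
	{
		unsigned char want[4096];
		size_t i;
		int d;

		if (len > sizeof(want))
			return -1;

		/* expected parity is the xor of all the data stripes */
		memcpy(want, data[0], len);
		for (d = 1; d < nr_data; d++)
			for (i = 0; i < len; i++)
				want[i] ^= data[d][i];

		if (!memcmp(parity, want, len))
			return 0;		/* parity is right, no writeback */

		memcpy(parity, want, len);	/* repair the on-disk copy */
		return 1;
	}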
2503static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2504{
2505 if (stripe >= 0 && stripe < rbio->nr_data)
2506 return 1;
2507 return 0;
2508}
2509
2510/*
2511 * While we're doing the parity check and repair, we could have errors
2512 * in reading pages off the disk. This checks for errors and if we're
2513 * not able to read the page it'll trigger parity reconstruction. The
2514 * parity scrub will be finished after we've reconstructed the failed
2515 * stripes
2516 */
2517static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2518{
2519 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2520 goto cleanup;
2521
2522 if (rbio->faila >= 0 || rbio->failb >= 0) {
2523 int dfail = 0, failp = -1;
2524
2525 if (is_data_stripe(rbio, rbio->faila))
2526 dfail++;
2527 else if (is_parity_stripe(rbio->faila))
2528 failp = rbio->faila;
2529
2530 if (is_data_stripe(rbio, rbio->failb))
2531 dfail++;
2532 else if (is_parity_stripe(rbio->failb))
2533 failp = rbio->failb;
2534
2535 /*
 2536 * Because we cannot use the parity that is being scrubbed to
 2537 * repair data, our repair capability is reduced by one.
 2538 * (In the case of RAID5, we cannot repair anything.)
2539 */
2540 if (dfail > rbio->bbio->max_errors - 1)
2541 goto cleanup;
2542
2543 /*
 2544 * If all the data is good and only the parity is bad,
 2545 * just repair the parity.
2546 */
2547 if (dfail == 0) {
2548 finish_parity_scrub(rbio, 0);
2549 return;
2550 }
2551
2552 /*
 2553 * Getting here means we have one corrupted data stripe and
 2554 * one corrupted parity on RAID6. If the corrupted parity is
 2555 * the one being scrubbed, we can luckily use the other parity
 2556 * to repair the data; otherwise we cannot repair the data stripe.
2557 */
2558 if (failp != rbio->scrubp)
2559 goto cleanup;
2560
2561 __raid_recover_end_io(rbio);
2562 } else {
2563 finish_parity_scrub(rbio, 1);
2564 }
2565 return;
2566
2567cleanup:
2568 rbio_orig_end_io(rbio, -EIO, 0);
2569}
2570
2571/*
 2572 * end io for the read phase of the parity scrub cycle. All the bios here
 2573 * are physical stripe bios we've read from the disk so we can recalculate
 2574 * the parity of the stripe.
 2575 *
 2576 * This will usually kick off finish_parity_scrub once all the bios are read
 2577 * in, but it may trigger parity reconstruction if we had any errors along the way
2578 */
2579static void raid56_parity_scrub_end_io(struct bio *bio, int err)
2580{
2581 struct btrfs_raid_bio *rbio = bio->bi_private;
2582
2583 if (err)
2584 fail_bio_stripe(rbio, bio);
2585 else
2586 set_bio_pages_uptodate(bio);
2587
2588 bio_put(bio);
2589
2590 if (!atomic_dec_and_test(&rbio->stripes_pending))
2591 return;
2592
2593 /*
 2594 * this will normally call finish_parity_scrub to start our
 2595 * write, but if there are any failed stripes we'll reconstruct
 2596 * from parity first
2597 */
2598 validate_rbio_for_parity_scrub(rbio);
2599}
2600
2601static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2602{
2603 int bios_to_read = 0;
2604 struct bio_list bio_list;
2605 int ret;
2606 int pagenr;
2607 int stripe;
2608 struct bio *bio;
2609
2610 ret = alloc_rbio_essential_pages(rbio);
2611 if (ret)
2612 goto cleanup;
2613
2614 bio_list_init(&bio_list);
2615
2616 atomic_set(&rbio->error, 0);
2617 /*
2618 * build a list of bios to read all the missing parts of this
2619 * stripe
2620 */
2621 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2622 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2623 struct page *page;
2624 /*
2625 * we want to find all the pages missing from
2626 * the rbio and read them from the disk. If
2627 * page_in_rbio finds a page in the bio list
2628 * we don't need to read it off the stripe.
2629 */
2630 page = page_in_rbio(rbio, stripe, pagenr, 1);
2631 if (page)
2632 continue;
2633
2634 page = rbio_stripe_page(rbio, stripe, pagenr);
2635 /*
2636 * the bio cache may have handed us an uptodate
2637 * page. If so, be happy and use it
2638 */
2639 if (PageUptodate(page))
2640 continue;
2641
2642 ret = rbio_add_io_page(rbio, &bio_list, page,
2643 stripe, pagenr, rbio->stripe_len);
2644 if (ret)
2645 goto cleanup;
2646 }
2647 }
2648
2649 bios_to_read = bio_list_size(&bio_list);
2650 if (!bios_to_read) {
2651 /*
2652 * this can happen if others have merged with
 2653 * us; it means there is nothing left to read.
2654 * But if there are missing devices it may not be
2655 * safe to do the full stripe write yet.
2656 */
2657 goto finish;
2658 }
2659
2660 /*
2661 * the bbio may be freed once we submit the last bio. Make sure
2662 * not to touch it after that
2663 */
2664 atomic_set(&rbio->stripes_pending, bios_to_read);
2665 while (1) {
2666 bio = bio_list_pop(&bio_list);
2667 if (!bio)
2668 break;
2669
2670 bio->bi_private = rbio;
2671 bio->bi_end_io = raid56_parity_scrub_end_io;
2672
2673 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2674 BTRFS_WQ_ENDIO_RAID56);
2675
2676 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2677 submit_bio(READ, bio);
2678 }
2679 /* the actual write will happen once the reads are done */
2680 return;
2681
2682cleanup:
2683 rbio_orig_end_io(rbio, -EIO, 0);
2684 return;
2685
2686finish:
2687 validate_rbio_for_parity_scrub(rbio);
2688}
2689
2690static void scrub_parity_work(struct btrfs_work *work)
2691{
2692 struct btrfs_raid_bio *rbio;
2693
2694 rbio = container_of(work, struct btrfs_raid_bio, work);
2695 raid56_parity_scrub_stripe(rbio);
2696}
2697
2698static void async_scrub_parity(struct btrfs_raid_bio *rbio)
2699{
2700 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
2701 scrub_parity_work, NULL, NULL);
2702
2703 btrfs_queue_work(rbio->fs_info->rmw_workers,
2704 &rbio->work);
2705}
2706
2707void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2708{
2709 if (!lock_stripe_add(rbio))
2710 async_scrub_parity(rbio);
2711}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index ea5d73bfdfbe..31d4a157b5e3 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -39,13 +39,25 @@ static inline int nr_data_stripes(struct map_lookup *map)
39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ 39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
40 ((x) == RAID6_Q_STRIPE)) 40 ((x) == RAID6_Q_STRIPE))
41 41
42struct btrfs_raid_bio;
43struct btrfs_device;
44
42int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 45int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
43 struct btrfs_bio *bbio, u64 *raid_map, 46 struct btrfs_bio *bbio, u64 *raid_map,
44 u64 stripe_len, int mirror_num); 47 u64 stripe_len, int mirror_num, int generic_io);
45int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 48int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
46 struct btrfs_bio *bbio, u64 *raid_map, 49 struct btrfs_bio *bbio, u64 *raid_map,
47 u64 stripe_len); 50 u64 stripe_len);
48 51
52struct btrfs_raid_bio *
53raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
54 struct btrfs_bio *bbio, u64 *raid_map,
55 u64 stripe_len, struct btrfs_device *scrub_dev,
56 unsigned long *dbitmap, int stripe_nsectors);
57void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
58 struct page *page, u64 logical);
59void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
60
49int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); 61int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
50void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); 62void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
51#endif 63#endif
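Taken together, scrub.c is expected to drive the three new entry points roughly like this (a sketch assembled from the declarations above and the scrub_parity fields added below; error handling elided):

	rbio = raid56_parity_alloc_scrub_rbio(root, bio, bbio, raid_map,
					      length, sparity->scrub_dev,
					      sparity->dbitmap,
					      sparity->nsectors);
	if (!rbio)
		goto out;

	list_for_each_entry(spage, &sparity->spages, list)
		raid56_parity_add_scrub_pages(rbio, spage->page,
					      spage->logical);

	/* takes the stripe lock, then queues the scrub worker */
	raid56_parity_submit_scrub_rbio(rbio);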
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efa083113827..f2bb13a23f86 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -63,10 +63,18 @@ struct scrub_ctx;
63 */ 63 */
64#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ 64#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
65 65
66struct scrub_recover {
67 atomic_t refs;
68 struct btrfs_bio *bbio;
69 u64 *raid_map;
70 u64 map_length;
71};
72
66struct scrub_page { 73struct scrub_page {
67 struct scrub_block *sblock; 74 struct scrub_block *sblock;
68 struct page *page; 75 struct page *page;
69 struct btrfs_device *dev; 76 struct btrfs_device *dev;
77 struct list_head list;
70 u64 flags; /* extent flags */ 78 u64 flags; /* extent flags */
71 u64 generation; 79 u64 generation;
72 u64 logical; 80 u64 logical;
@@ -79,6 +87,8 @@ struct scrub_page {
79 unsigned int io_error:1; 87 unsigned int io_error:1;
80 }; 88 };
81 u8 csum[BTRFS_CSUM_SIZE]; 89 u8 csum[BTRFS_CSUM_SIZE];
90
91 struct scrub_recover *recover;
82}; 92};
83 93
84struct scrub_bio { 94struct scrub_bio {
@@ -105,14 +115,52 @@ struct scrub_block {
105 atomic_t outstanding_pages; 115 atomic_t outstanding_pages;
106 atomic_t ref_count; /* free mem on transition to zero */ 116 atomic_t ref_count; /* free mem on transition to zero */
107 struct scrub_ctx *sctx; 117 struct scrub_ctx *sctx;
118 struct scrub_parity *sparity;
108 struct { 119 struct {
109 unsigned int header_error:1; 120 unsigned int header_error:1;
110 unsigned int checksum_error:1; 121 unsigned int checksum_error:1;
111 unsigned int no_io_error_seen:1; 122 unsigned int no_io_error_seen:1;
112 unsigned int generation_error:1; /* also sets header_error */ 123 unsigned int generation_error:1; /* also sets header_error */
124
 125 /* The following is for data used in the parity check; */
 126 /* only data protected by a checksum qualifies. */
127 unsigned int data_corrected:1;
113 }; 128 };
114}; 129};
115 130
 131/* Used for chunks with a parity stripe, such as RAID5/6 */
132struct scrub_parity {
133 struct scrub_ctx *sctx;
134
135 struct btrfs_device *scrub_dev;
136
137 u64 logic_start;
138
139 u64 logic_end;
140
141 int nsectors;
142
143 int stripe_len;
144
145 atomic_t ref_count;
146
147 struct list_head spages;
148
149 /* Work of parity check and repair */
150 struct btrfs_work work;
151
 152 /* Marks the parity blocks which cover data */
153 unsigned long *dbitmap;
154
155 /*
 156 * Marks the parity blocks which cover data, but where errors
 157 * happened while reading or checking that data
158 */
159 unsigned long *ebitmap;
160
161 unsigned long bitmap[0];
162};
163
116struct scrub_wr_ctx { 164struct scrub_wr_ctx {
117 struct scrub_bio *wr_curr_bio; 165 struct scrub_bio *wr_curr_bio;
118 struct btrfs_device *tgtdev; 166 struct btrfs_device *tgtdev;
@@ -196,7 +244,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
196static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 244static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
197 struct scrub_block *sblock, int is_metadata, 245 struct scrub_block *sblock, int is_metadata,
198 int have_csum, u8 *csum, u64 generation, 246 int have_csum, u8 *csum, u64 generation,
199 u16 csum_size); 247 u16 csum_size, int retry_failed_mirror);
200static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 248static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
201 struct scrub_block *sblock, 249 struct scrub_block *sblock,
202 int is_metadata, int have_csum, 250 int is_metadata, int have_csum,
@@ -218,6 +266,8 @@ static void scrub_block_get(struct scrub_block *sblock);
218static void scrub_block_put(struct scrub_block *sblock); 266static void scrub_block_put(struct scrub_block *sblock);
219static void scrub_page_get(struct scrub_page *spage); 267static void scrub_page_get(struct scrub_page *spage);
220static void scrub_page_put(struct scrub_page *spage); 268static void scrub_page_put(struct scrub_page *spage);
269static void scrub_parity_get(struct scrub_parity *sparity);
270static void scrub_parity_put(struct scrub_parity *sparity);
221static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, 271static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
222 struct scrub_page *spage); 272 struct scrub_page *spage);
223static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 273static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -790,6 +840,20 @@ out:
790 scrub_pending_trans_workers_dec(sctx); 840 scrub_pending_trans_workers_dec(sctx);
791} 841}
792 842
843static inline void scrub_get_recover(struct scrub_recover *recover)
844{
845 atomic_inc(&recover->refs);
846}
847
848static inline void scrub_put_recover(struct scrub_recover *recover)
849{
850 if (atomic_dec_and_test(&recover->refs)) {
851 kfree(recover->bbio);
852 kfree(recover->raid_map);
853 kfree(recover);
854 }
855}
856
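The scrub_recover refcount ties one bbio/raid_map pair to every page that still references it. The intended flow, summarized from the hunks below:

	/*
	 * scrub_setup_recheck_block(): atomic_set(&recover->refs, 1)
	 *   each page added:           scrub_get_recover()
	 *   after the mirror loop:     scrub_put_recover()  (drops the
	 *                              setup reference)
	 * cleanup in scrub_handle_errored_block(): one scrub_put_recover()
	 * per page; the last put frees bbio, raid_map and recover itself.
	 */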
793/* 857/*
794 * scrub_handle_errored_block gets called when either verification of the 858 * scrub_handle_errored_block gets called when either verification of the
795 * pages failed or the bio failed to read, e.g. with EIO. In the latter 859 * pages failed or the bio failed to read, e.g. with EIO. In the latter
@@ -906,7 +970,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
906 970
907 /* build and submit the bios for the failed mirror, check checksums */ 971 /* build and submit the bios for the failed mirror, check checksums */
908 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 972 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
909 csum, generation, sctx->csum_size); 973 csum, generation, sctx->csum_size, 1);
910 974
911 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 975 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
912 sblock_bad->no_io_error_seen) { 976 sblock_bad->no_io_error_seen) {
@@ -920,6 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
920 */ 984 */
921 spin_lock(&sctx->stat_lock); 985 spin_lock(&sctx->stat_lock);
922 sctx->stat.unverified_errors++; 986 sctx->stat.unverified_errors++;
987 sblock_to_check->data_corrected = 1;
923 spin_unlock(&sctx->stat_lock); 988 spin_unlock(&sctx->stat_lock);
924 989
925 if (sctx->is_dev_replace) 990 if (sctx->is_dev_replace)
@@ -1019,7 +1084,7 @@ nodatasum_case:
1019 /* build and submit the bios, check checksums */ 1084 /* build and submit the bios, check checksums */
1020 scrub_recheck_block(fs_info, sblock_other, is_metadata, 1085 scrub_recheck_block(fs_info, sblock_other, is_metadata,
1021 have_csum, csum, generation, 1086 have_csum, csum, generation,
1022 sctx->csum_size); 1087 sctx->csum_size, 0);
1023 1088
1024 if (!sblock_other->header_error && 1089 if (!sblock_other->header_error &&
1025 !sblock_other->checksum_error && 1090 !sblock_other->checksum_error &&
@@ -1169,7 +1234,7 @@ nodatasum_case:
1169 */ 1234 */
1170 scrub_recheck_block(fs_info, sblock_bad, 1235 scrub_recheck_block(fs_info, sblock_bad,
1171 is_metadata, have_csum, csum, 1236 is_metadata, have_csum, csum,
1172 generation, sctx->csum_size); 1237 generation, sctx->csum_size, 1);
1173 if (!sblock_bad->header_error && 1238 if (!sblock_bad->header_error &&
1174 !sblock_bad->checksum_error && 1239 !sblock_bad->checksum_error &&
1175 sblock_bad->no_io_error_seen) 1240 sblock_bad->no_io_error_seen)
@@ -1180,6 +1245,7 @@ nodatasum_case:
1180corrected_error: 1245corrected_error:
1181 spin_lock(&sctx->stat_lock); 1246 spin_lock(&sctx->stat_lock);
1182 sctx->stat.corrected_errors++; 1247 sctx->stat.corrected_errors++;
1248 sblock_to_check->data_corrected = 1;
1183 spin_unlock(&sctx->stat_lock); 1249 spin_unlock(&sctx->stat_lock);
1184 printk_ratelimited_in_rcu(KERN_ERR 1250 printk_ratelimited_in_rcu(KERN_ERR
1185 "BTRFS: fixed up error at logical %llu on dev %s\n", 1251 "BTRFS: fixed up error at logical %llu on dev %s\n",
@@ -1201,11 +1267,18 @@ out:
1201 mirror_index++) { 1267 mirror_index++) {
1202 struct scrub_block *sblock = sblocks_for_recheck + 1268 struct scrub_block *sblock = sblocks_for_recheck +
1203 mirror_index; 1269 mirror_index;
1270 struct scrub_recover *recover;
1204 int page_index; 1271 int page_index;
1205 1272
1206 for (page_index = 0; page_index < sblock->page_count; 1273 for (page_index = 0; page_index < sblock->page_count;
1207 page_index++) { 1274 page_index++) {
1208 sblock->pagev[page_index]->sblock = NULL; 1275 sblock->pagev[page_index]->sblock = NULL;
1276 recover = sblock->pagev[page_index]->recover;
1277 if (recover) {
1278 scrub_put_recover(recover);
1279 sblock->pagev[page_index]->recover =
1280 NULL;
1281 }
1209 scrub_page_put(sblock->pagev[page_index]); 1282 scrub_page_put(sblock->pagev[page_index]);
1210 } 1283 }
1211 } 1284 }
@@ -1215,14 +1288,63 @@ out:
1215 return 0; 1288 return 0;
1216} 1289}
1217 1290
1291static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
1292{
1293 if (raid_map) {
1294 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
1295 return 3;
1296 else
1297 return 2;
1298 } else {
1299 return (int)bbio->num_stripes;
1300 }
1301}
1302
1303static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
1304 u64 mapped_length,
1305 int nstripes, int mirror,
1306 int *stripe_index,
1307 u64 *stripe_offset)
1308{
1309 int i;
1310
1311 if (raid_map) {
1312 /* RAID5/6 */
1313 for (i = 0; i < nstripes; i++) {
1314 if (raid_map[i] == RAID6_Q_STRIPE ||
1315 raid_map[i] == RAID5_P_STRIPE)
1316 continue;
1317
1318 if (logical >= raid_map[i] &&
1319 logical < raid_map[i] + mapped_length)
1320 break;
1321 }
1322
1323 *stripe_index = i;
1324 *stripe_offset = logical - raid_map[i];
1325 } else {
1326 /* The other RAID type */
1327 *stripe_index = mirror;
1328 *stripe_offset = 0;
1329 }
1330}
1331
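scrub_stripe_index_and_offset() walks the raid_map to turn a logical address back into a (stripe, offset) pair. A worked RAID5 example with assumed values (mapped_length = 64K):

	/*
	 * raid_map = { L, L + 64K, RAID5_P_STRIPE }, logical = L + 80K
	 *
	 * raid_map[0] covers [L, L + 64K)        -> miss
	 * raid_map[1] covers [L + 64K, L + 128K) -> hit
	 *
	 * *stripe_index  = 1
	 * *stripe_offset = 16K
	 * and the page's physical address becomes
	 * bbio->stripes[1].physical + 16K, as used in the setup code below.
	 */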
1218static int scrub_setup_recheck_block(struct scrub_ctx *sctx, 1332static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1219 struct btrfs_fs_info *fs_info, 1333 struct btrfs_fs_info *fs_info,
1220 struct scrub_block *original_sblock, 1334 struct scrub_block *original_sblock,
1221 u64 length, u64 logical, 1335 u64 length, u64 logical,
1222 struct scrub_block *sblocks_for_recheck) 1336 struct scrub_block *sblocks_for_recheck)
1223{ 1337{
1338 struct scrub_recover *recover;
1339 struct btrfs_bio *bbio;
1340 u64 *raid_map;
1341 u64 sublen;
1342 u64 mapped_length;
1343 u64 stripe_offset;
1344 int stripe_index;
1224 int page_index; 1345 int page_index;
1225 int mirror_index; 1346 int mirror_index;
1347 int nmirrors;
1226 int ret; 1348 int ret;
1227 1349
1228 /* 1350 /*
@@ -1233,23 +1355,39 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1233 1355
1234 page_index = 0; 1356 page_index = 0;
1235 while (length > 0) { 1357 while (length > 0) {
1236 u64 sublen = min_t(u64, length, PAGE_SIZE); 1358 sublen = min_t(u64, length, PAGE_SIZE);
1237 u64 mapped_length = sublen; 1359 mapped_length = sublen;
1238 struct btrfs_bio *bbio = NULL; 1360 bbio = NULL;
1361 raid_map = NULL;
1239 1362
1240 /* 1363 /*
1241 * with a length of PAGE_SIZE, each returned stripe 1364 * with a length of PAGE_SIZE, each returned stripe
1242 * represents one mirror 1365 * represents one mirror
1243 */ 1366 */
1244 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, 1367 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
1245 &mapped_length, &bbio, 0); 1368 &mapped_length, &bbio, 0, &raid_map);
1246 if (ret || !bbio || mapped_length < sublen) { 1369 if (ret || !bbio || mapped_length < sublen) {
1247 kfree(bbio); 1370 kfree(bbio);
1371 kfree(raid_map);
1248 return -EIO; 1372 return -EIO;
1249 } 1373 }
1250 1374
1375 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1376 if (!recover) {
1377 kfree(bbio);
1378 kfree(raid_map);
1379 return -ENOMEM;
1380 }
1381
1382 atomic_set(&recover->refs, 1);
1383 recover->bbio = bbio;
1384 recover->raid_map = raid_map;
1385 recover->map_length = mapped_length;
1386
1251 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); 1387 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1252 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; 1388
1389 nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
1390 for (mirror_index = 0; mirror_index < nmirrors;
1253 mirror_index++) { 1391 mirror_index++) {
1254 struct scrub_block *sblock; 1392 struct scrub_block *sblock;
1255 struct scrub_page *page; 1393 struct scrub_page *page;
@@ -1265,26 +1403,38 @@ leave_nomem:
1265 spin_lock(&sctx->stat_lock); 1403 spin_lock(&sctx->stat_lock);
1266 sctx->stat.malloc_errors++; 1404 sctx->stat.malloc_errors++;
1267 spin_unlock(&sctx->stat_lock); 1405 spin_unlock(&sctx->stat_lock);
1268 kfree(bbio); 1406 scrub_put_recover(recover);
1269 return -ENOMEM; 1407 return -ENOMEM;
1270 } 1408 }
1271 scrub_page_get(page); 1409 scrub_page_get(page);
1272 sblock->pagev[page_index] = page; 1410 sblock->pagev[page_index] = page;
1273 page->logical = logical; 1411 page->logical = logical;
1274 page->physical = bbio->stripes[mirror_index].physical; 1412
1413 scrub_stripe_index_and_offset(logical, raid_map,
1414 mapped_length,
1415 bbio->num_stripes,
1416 mirror_index,
1417 &stripe_index,
1418 &stripe_offset);
1419 page->physical = bbio->stripes[stripe_index].physical +
1420 stripe_offset;
1421 page->dev = bbio->stripes[stripe_index].dev;
1422
1275 BUG_ON(page_index >= original_sblock->page_count); 1423 BUG_ON(page_index >= original_sblock->page_count);
1276 page->physical_for_dev_replace = 1424 page->physical_for_dev_replace =
1277 original_sblock->pagev[page_index]-> 1425 original_sblock->pagev[page_index]->
1278 physical_for_dev_replace; 1426 physical_for_dev_replace;
1279 /* for missing devices, dev->bdev is NULL */ 1427 /* for missing devices, dev->bdev is NULL */
1280 page->dev = bbio->stripes[mirror_index].dev;
1281 page->mirror_num = mirror_index + 1; 1428 page->mirror_num = mirror_index + 1;
1282 sblock->page_count++; 1429 sblock->page_count++;
1283 page->page = alloc_page(GFP_NOFS); 1430 page->page = alloc_page(GFP_NOFS);
1284 if (!page->page) 1431 if (!page->page)
1285 goto leave_nomem; 1432 goto leave_nomem;
1433
1434 scrub_get_recover(recover);
1435 page->recover = recover;
1286 } 1436 }
1287 kfree(bbio); 1437 scrub_put_recover(recover);
1288 length -= sublen; 1438 length -= sublen;
1289 logical += sublen; 1439 logical += sublen;
1290 page_index++; 1440 page_index++;
@@ -1293,6 +1443,51 @@ leave_nomem:
1293 return 0; 1443 return 0;
1294} 1444}
1295 1445
1446struct scrub_bio_ret {
1447 struct completion event;
1448 int error;
1449};
1450
1451static void scrub_bio_wait_endio(struct bio *bio, int error)
1452{
1453 struct scrub_bio_ret *ret = bio->bi_private;
1454
1455 ret->error = error;
1456 complete(&ret->event);
1457}
1458
1459static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1460{
1461 return page->recover && page->recover->raid_map;
1462}
1463
1464static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1465 struct bio *bio,
1466 struct scrub_page *page)
1467{
1468 struct scrub_bio_ret done;
1469 int ret;
1470
1471 init_completion(&done.event);
1472 done.error = 0;
1473 bio->bi_iter.bi_sector = page->logical >> 9;
1474 bio->bi_private = &done;
1475 bio->bi_end_io = scrub_bio_wait_endio;
1476
1477 ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
1478 page->recover->raid_map,
1479 page->recover->map_length,
1480 page->mirror_num, 0);
1481 if (ret)
1482 return ret;
1483
1484 wait_for_completion(&done.event);
1485 if (done.error)
1486 return -EIO;
1487
1488 return 0;
1489}
1490
1296/* 1491/*
1297 * this function will check the on disk data for checksum errors, header 1492 * this function will check the on disk data for checksum errors, header
1298 * errors and read I/O errors. If any I/O errors happen, the exact pages 1493 * errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1303,7 +1498,7 @@ leave_nomem:
1303static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 1498static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1304 struct scrub_block *sblock, int is_metadata, 1499 struct scrub_block *sblock, int is_metadata,
1305 int have_csum, u8 *csum, u64 generation, 1500 int have_csum, u8 *csum, u64 generation,
1306 u16 csum_size) 1501 u16 csum_size, int retry_failed_mirror)
1307{ 1502{
1308 int page_num; 1503 int page_num;
1309 1504
@@ -1329,11 +1524,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1329 continue; 1524 continue;
1330 } 1525 }
1331 bio->bi_bdev = page->dev->bdev; 1526 bio->bi_bdev = page->dev->bdev;
1332 bio->bi_iter.bi_sector = page->physical >> 9;
1333 1527
1334 bio_add_page(bio, page->page, PAGE_SIZE, 0); 1528 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1335 if (btrfsic_submit_bio_wait(READ, bio)) 1529 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1336 sblock->no_io_error_seen = 0; 1530 if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
1531 sblock->no_io_error_seen = 0;
1532 } else {
1533 bio->bi_iter.bi_sector = page->physical >> 9;
1534
1535 if (btrfsic_submit_bio_wait(READ, bio))
1536 sblock->no_io_error_seen = 0;
1537 }
1337 1538
1338 bio_put(bio); 1539 bio_put(bio);
1339 } 1540 }
@@ -1486,6 +1687,13 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1486{ 1687{
1487 int page_num; 1688 int page_num;
1488 1689
1690 /*
1691 * This block is used for the check of the parity on the source device,
1692 * so the data needn't be written into the destination device.
1693 */
1694 if (sblock->sparity)
1695 return;
1696
1489 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1697 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1490 int ret; 1698 int ret;
1491 1699
@@ -1867,6 +2075,9 @@ static void scrub_block_put(struct scrub_block *sblock)
1867 if (atomic_dec_and_test(&sblock->ref_count)) { 2075 if (atomic_dec_and_test(&sblock->ref_count)) {
1868 int i; 2076 int i;
1869 2077
2078 if (sblock->sparity)
2079 scrub_parity_put(sblock->sparity);
2080
1870 for (i = 0; i < sblock->page_count; i++) 2081 for (i = 0; i < sblock->page_count; i++)
1871 scrub_page_put(sblock->pagev[i]); 2082 scrub_page_put(sblock->pagev[i]);
1872 kfree(sblock); 2083 kfree(sblock);
@@ -2124,9 +2335,51 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
2124 scrub_pending_bio_dec(sctx); 2335 scrub_pending_bio_dec(sctx);
2125} 2336}
2126 2337
2338static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2339 unsigned long *bitmap,
2340 u64 start, u64 len)
2341{
2342 int offset;
2343 int nsectors;
2344 int sectorsize = sparity->sctx->dev_root->sectorsize;
2345
2346 if (len >= sparity->stripe_len) {
2347 bitmap_set(bitmap, 0, sparity->nsectors);
2348 return;
2349 }
2350
2351 start -= sparity->logic_start;
2352 offset = (int)do_div(start, sparity->stripe_len);
2353 offset /= sectorsize;
2354 nsectors = (int)len / sectorsize;
2355
2356 if (offset + nsectors <= sparity->nsectors) {
2357 bitmap_set(bitmap, offset, nsectors);
2358 return;
2359 }
2360
2361 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2362 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2363}
2364
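A worked example of the wraparound handled by the two trailing bitmap_set() calls in __scrub_mark_bitmap() (values chosen for illustration):

/*
 * Assume stripe_len = 64K and sectorsize = 4K, so sparity->nsectors = 16.
 * Marking start - logic_start = 60K, len = 8K:
 *   offset   = (60K % 64K) / 4K = 15
 *   nsectors = 8K / 4K          = 2
 * offset + 2 = 17 > 16, so the range wraps around the stripe:
 *   bitmap_set(bitmap, 15, 1);    sector 15
 *   bitmap_set(bitmap, 0, 1);     wraps back to sector 0
 */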
2365static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2366 u64 start, u64 len)
2367{
2368 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2369}
2370
2371static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2372 u64 start, u64 len)
2373{
2374 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2375}
2376
2127static void scrub_block_complete(struct scrub_block *sblock) 2377static void scrub_block_complete(struct scrub_block *sblock)
2128{ 2378{
2379 int corrupted = 0;
2380
2129 if (!sblock->no_io_error_seen) { 2381 if (!sblock->no_io_error_seen) {
2382 corrupted = 1;
2130 scrub_handle_errored_block(sblock); 2383 scrub_handle_errored_block(sblock);
2131 } else { 2384 } else {
2132 /* 2385 /*
@@ -2134,9 +2387,19 @@ static void scrub_block_complete(struct scrub_block *sblock)
2134 * dev replace case, otherwise write here in dev replace 2387 * dev replace case, otherwise write here in dev replace
2135 * case. 2388 * case.
2136 */ 2389 */
2137 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) 2390 corrupted = scrub_checksum(sblock);
2391 if (!corrupted && sblock->sctx->is_dev_replace)
2138 scrub_write_block_to_dev_replace(sblock); 2392 scrub_write_block_to_dev_replace(sblock);
2139 } 2393 }
2394
2395 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2396 u64 start = sblock->pagev[0]->logical;
2397 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2398 PAGE_SIZE;
2399
2400 scrub_parity_mark_sectors_error(sblock->sparity,
2401 start, end - start);
2402 }
2140} 2403}
2141 2404
2142static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, 2405static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -2228,6 +2491,132 @@ behind_scrub_pages:
2228 return 0; 2491 return 0;
2229} 2492}
2230 2493
2494static int scrub_pages_for_parity(struct scrub_parity *sparity,
2495 u64 logical, u64 len,
2496 u64 physical, struct btrfs_device *dev,
2497 u64 flags, u64 gen, int mirror_num, u8 *csum)
2498{
2499 struct scrub_ctx *sctx = sparity->sctx;
2500 struct scrub_block *sblock;
2501 int index;
2502
2503 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2504 if (!sblock) {
2505 spin_lock(&sctx->stat_lock);
2506 sctx->stat.malloc_errors++;
2507 spin_unlock(&sctx->stat_lock);
2508 return -ENOMEM;
2509 }
2510
2511 /* one ref inside this function, plus one for each page added to
2512 * a bio later on */
2513 atomic_set(&sblock->ref_count, 1);
2514 sblock->sctx = sctx;
2515 sblock->no_io_error_seen = 1;
2516 sblock->sparity = sparity;
2517 scrub_parity_get(sparity);
2518
2519 for (index = 0; len > 0; index++) {
2520 struct scrub_page *spage;
2521 u64 l = min_t(u64, len, PAGE_SIZE);
2522
2523 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2524 if (!spage) {
2525leave_nomem:
2526 spin_lock(&sctx->stat_lock);
2527 sctx->stat.malloc_errors++;
2528 spin_unlock(&sctx->stat_lock);
2529 scrub_block_put(sblock);
2530 return -ENOMEM;
2531 }
2532 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2533 /* For scrub block */
2534 scrub_page_get(spage);
2535 sblock->pagev[index] = spage;
2536 /* For scrub parity */
2537 scrub_page_get(spage);
2538 list_add_tail(&spage->list, &sparity->spages);
2539 spage->sblock = sblock;
2540 spage->dev = dev;
2541 spage->flags = flags;
2542 spage->generation = gen;
2543 spage->logical = logical;
2544 spage->physical = physical;
2545 spage->mirror_num = mirror_num;
2546 if (csum) {
2547 spage->have_csum = 1;
2548 memcpy(spage->csum, csum, sctx->csum_size);
2549 } else {
2550 spage->have_csum = 0;
2551 }
2552 sblock->page_count++;
2553 spage->page = alloc_page(GFP_NOFS);
2554 if (!spage->page)
2555 goto leave_nomem;
2556 len -= l;
2557 logical += l;
2558 physical += l;
2559 }
2560
2561 WARN_ON(sblock->page_count == 0);
2562 for (index = 0; index < sblock->page_count; index++) {
2563 struct scrub_page *spage = sblock->pagev[index];
2564 int ret;
2565
2566 ret = scrub_add_page_to_rd_bio(sctx, spage);
2567 if (ret) {
2568 scrub_block_put(sblock);
2569 return ret;
2570 }
2571 }
2572
2573 /* last one frees, either here or in bio completion for last page */
2574 scrub_block_put(sblock);
2575 return 0;
2576}
2577
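Worth spelling out: every scrub_page built in scrub_pages_for_parity() above carries two references, matching the two scrub_page_get() calls:

/*
 * - one reference is owned by sblock->pagev[] and is dropped through
 *   scrub_block_put() when the per-block I/O path finishes;
 * - one reference is owned by sparity->spages and is dropped in
 *   scrub_free_parity() once the whole stripe has been checked.
 * The second reference is what lets the parity machinery keep using the
 * pages after the read/repair path has released its copies.
 */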
2578static int scrub_extent_for_parity(struct scrub_parity *sparity,
2579 u64 logical, u64 len,
2580 u64 physical, struct btrfs_device *dev,
2581 u64 flags, u64 gen, int mirror_num)
2582{
2583 struct scrub_ctx *sctx = sparity->sctx;
2584 int ret;
2585 u8 csum[BTRFS_CSUM_SIZE];
2586 u32 blocksize;
2587
2588 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2589 blocksize = sctx->sectorsize;
2590 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2591 blocksize = sctx->nodesize;
2592 } else {
2593 blocksize = sctx->sectorsize;
2594 WARN_ON(1);
2595 }
2596
2597 while (len) {
2598 u64 l = min_t(u64, len, blocksize);
2599 int have_csum = 0;
2600
2601 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2602 /* push csums to sbio */
2603 have_csum = scrub_find_csum(sctx, logical, l, csum);
2604 if (have_csum == 0)
2605 goto skip;
2606 }
2607 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2608 flags, gen, mirror_num,
2609 have_csum ? csum : NULL);
2610skip:
2611 if (ret)
2612 return ret;
2613 len -= l;
2614 logical += l;
2615 physical += l;
2616 }
2617 return 0;
2618}
2619
2231/* 2620/*
2232 * Given a physical address, this will calculate its 2621 * Given a physical address, this will calculate its
2233 * logical offset. If this is a parity stripe, it will return 2622 * logical offset. If this is a parity stripe, it will return
@@ -2236,7 +2625,8 @@ behind_scrub_pages:
2236 * return 0 if it is a data stripe, 1 means parity stripe. 2625 * return 0 if it is a data stripe, 1 means parity stripe.
2237 */ 2626 */
2238static int get_raid56_logic_offset(u64 physical, int num, 2627static int get_raid56_logic_offset(u64 physical, int num,
2239 struct map_lookup *map, u64 *offset) 2628 struct map_lookup *map, u64 *offset,
2629 u64 *stripe_start)
2240{ 2630{
2241 int i; 2631 int i;
2242 int j = 0; 2632 int j = 0;
@@ -2247,6 +2637,9 @@ static int get_raid56_logic_offset(u64 physical, int num,
2247 2637
2248 last_offset = (physical - map->stripes[num].physical) * 2638 last_offset = (physical - map->stripes[num].physical) *
2249 nr_data_stripes(map); 2639 nr_data_stripes(map);
2640 if (stripe_start)
2641 *stripe_start = last_offset;
2642
2250 *offset = last_offset; 2643 *offset = last_offset;
2251 for (i = 0; i < nr_data_stripes(map); i++) { 2644 for (i = 0; i < nr_data_stripes(map); i++) {
2252 *offset = last_offset + i * map->stripe_len; 2645 *offset = last_offset + i * map->stripe_len;
@@ -2269,13 +2662,330 @@ static int get_raid56_logic_offset(u64 physical, int num,
2269 return 1; 2662 return 1;
2270} 2663}
2271 2664
2665static void scrub_free_parity(struct scrub_parity *sparity)
2666{
2667 struct scrub_ctx *sctx = sparity->sctx;
2668 struct scrub_page *curr, *next;
2669 int nbits;
2670
2671 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2672 if (nbits) {
2673 spin_lock(&sctx->stat_lock);
2674 sctx->stat.read_errors += nbits;
2675 sctx->stat.uncorrectable_errors += nbits;
2676 spin_unlock(&sctx->stat_lock);
2677 }
2678
2679 list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2680 list_del_init(&curr->list);
2681 scrub_page_put(curr);
2682 }
2683
2684 kfree(sparity);
2685}
2686
2687static void scrub_parity_bio_endio(struct bio *bio, int error)
2688{
2689 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2690 struct scrub_ctx *sctx = sparity->sctx;
2691
2692 if (error)
2693 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2694 sparity->nsectors);
2695
2696 scrub_free_parity(sparity);
2697 scrub_pending_bio_dec(sctx);
2698 bio_put(bio);
2699}
2700
2701static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2702{
2703 struct scrub_ctx *sctx = sparity->sctx;
2704 struct bio *bio;
2705 struct btrfs_raid_bio *rbio;
2706 struct scrub_page *spage;
2707 struct btrfs_bio *bbio = NULL;
2708 u64 *raid_map = NULL;
2709 u64 length;
2710 int ret;
2711
2712 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2713 sparity->nsectors))
2714 goto out;
2715
2716 length = sparity->logic_end - sparity->logic_start + 1;
2717 ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
2718 sparity->logic_start,
2719 &length, &bbio, 0, &raid_map);
2720 if (ret || !bbio || !raid_map)
2721 goto bbio_out;
2722
2723 bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2724 if (!bio)
2725 goto bbio_out;
2726
2727 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2728 bio->bi_private = sparity;
2729 bio->bi_end_io = scrub_parity_bio_endio;
2730
2731 rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
2732 raid_map, length,
2733 sparity->scrub_dev,
2734 sparity->dbitmap,
2735 sparity->nsectors);
2736 if (!rbio)
2737 goto rbio_out;
2738
2739 list_for_each_entry(spage, &sparity->spages, list)
2740 raid56_parity_add_scrub_pages(rbio, spage->page,
2741 spage->logical);
2742
2743 scrub_pending_bio_inc(sctx);
2744 raid56_parity_submit_scrub_rbio(rbio);
2745 return;
2746
2747rbio_out:
2748 bio_put(bio);
2749bbio_out:
2750 kfree(bbio);
2751 kfree(raid_map);
2752 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2753 sparity->nsectors);
2754 spin_lock(&sctx->stat_lock);
2755 sctx->stat.malloc_errors++;
2756 spin_unlock(&sctx->stat_lock);
2757out:
2758 scrub_free_parity(sparity);
2759}
2760
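The two bitmaps gate scrub_parity_check_and_repair() above; dbitmap is filled by scrub_parity_mark_sectors_data() and ebitmap by scrub_parity_mark_sectors_error(). Their interaction, with a small example:

/*
 * dbitmap - sectors of this stripe that contain scrubbed data
 * ebitmap - sectors whose read or checksum verification failed
 *
 * bitmap_andnot(dbitmap, dbitmap, ebitmap, nsectors) keeps only the
 * sectors that are both present and healthy:
 *   data on sectors 0-3, error on sector 1:
 *       andnot leaves sectors {0,2,3} -> proceed with the parity check
 *   data on sector 1 only, error on sector 1:
 *       andnot leaves nothing -> goto out, nothing trustworthy remains
 *       to recompute parity from
 */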
2761static inline int scrub_calc_parity_bitmap_len(int nsectors)
2762{
2763 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
2764}
2765
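The helper above sizes each bitmap in whole longs so the bitmap_*() API can work word-at-a-time; for example:

/*
 * With BITS_PER_LONG == 64 (and BITS_PER_LONG / 8 == sizeof(long)):
 *   nsectors = 16  -> DIV_ROUND_UP(16, 64)  = 1 long  ->  8 bytes
 *   nsectors = 100 -> DIV_ROUND_UP(100, 64) = 2 longs -> 16 bytes
 * scrub_raid56_parity() allocates two such regions (dbitmap and
 * ebitmap) back to back after struct scrub_parity.
 */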
2766static void scrub_parity_get(struct scrub_parity *sparity)
2767{
2768 atomic_inc(&sparity->ref_count);
2769}
2770
2771static void scrub_parity_put(struct scrub_parity *sparity)
2772{
2773 if (!atomic_dec_and_test(&sparity->ref_count))
2774 return;
2775
2776 scrub_parity_check_and_repair(sparity);
2777}
2778
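Putting the get/put pairs above together, the lifetime of a scrub_parity looks like this (all call sites are within this patch):

/*
 * scrub_raid56_parity()        atomic_set(&sparity->ref_count, 1)
 * scrub_pages_for_parity()     scrub_parity_get() per scrub_block that
 *                              shares the stripe
 * scrub_block_put()            scrub_parity_put() as each block finishes
 * scrub_raid56_parity() end    scrub_parity_put() drops the initial ref
 *
 * Whichever put runs last triggers scrub_parity_check_and_repair(), so
 * the parity check starts only after every data block of the stripe has
 * been read and verified.
 */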
2779static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2780 struct map_lookup *map,
2781 struct btrfs_device *sdev,
2782 struct btrfs_path *path,
2783 u64 logic_start,
2784 u64 logic_end)
2785{
2786 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2787 struct btrfs_root *root = fs_info->extent_root;
2788 struct btrfs_root *csum_root = fs_info->csum_root;
2789 struct btrfs_extent_item *extent;
2790 u64 flags;
2791 int ret;
2792 int slot;
2793 struct extent_buffer *l;
2794 struct btrfs_key key;
2795 u64 generation;
2796 u64 extent_logical;
2797 u64 extent_physical;
2798 u64 extent_len;
2799 struct btrfs_device *extent_dev;
2800 struct scrub_parity *sparity;
2801 int nsectors;
2802 int bitmap_len;
2803 int extent_mirror_num;
2804 int stop_loop = 0;
2805
2806 nsectors = map->stripe_len / root->sectorsize;
2807 bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2808 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2809 GFP_NOFS);
2810 if (!sparity) {
2811 spin_lock(&sctx->stat_lock);
2812 sctx->stat.malloc_errors++;
2813 spin_unlock(&sctx->stat_lock);
2814 return -ENOMEM;
2815 }
2816
2817 sparity->stripe_len = map->stripe_len;
2818 sparity->nsectors = nsectors;
2819 sparity->sctx = sctx;
2820 sparity->scrub_dev = sdev;
2821 sparity->logic_start = logic_start;
2822 sparity->logic_end = logic_end;
2823 atomic_set(&sparity->ref_count, 1);
2824 INIT_LIST_HEAD(&sparity->spages);
2825 sparity->dbitmap = sparity->bitmap;
2826 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2827
2828 ret = 0;
2829 while (logic_start < logic_end) {
2830 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2831 key.type = BTRFS_METADATA_ITEM_KEY;
2832 else
2833 key.type = BTRFS_EXTENT_ITEM_KEY;
2834 key.objectid = logic_start;
2835 key.offset = (u64)-1;
2836
2837 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2838 if (ret < 0)
2839 goto out;
2840
2841 if (ret > 0) {
2842 ret = btrfs_previous_extent_item(root, path, 0);
2843 if (ret < 0)
2844 goto out;
2845 if (ret > 0) {
2846 btrfs_release_path(path);
2847 ret = btrfs_search_slot(NULL, root, &key,
2848 path, 0, 0);
2849 if (ret < 0)
2850 goto out;
2851 }
2852 }
2853
2854 stop_loop = 0;
2855 while (1) {
2856 u64 bytes;
2857
2858 l = path->nodes[0];
2859 slot = path->slots[0];
2860 if (slot >= btrfs_header_nritems(l)) {
2861 ret = btrfs_next_leaf(root, path);
2862 if (ret == 0)
2863 continue;
2864 if (ret < 0)
2865 goto out;
2866
2867 stop_loop = 1;
2868 break;
2869 }
2870 btrfs_item_key_to_cpu(l, &key, slot);
2871
2872 if (key.type == BTRFS_METADATA_ITEM_KEY)
2873 bytes = root->nodesize;
2874 else
2875 bytes = key.offset;
2876
2877 if (key.objectid + bytes <= logic_start)
2878 goto next;
2879
2880 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2881 key.type != BTRFS_METADATA_ITEM_KEY)
2882 goto next;
2883
2884 if (key.objectid > logic_end) {
2885 stop_loop = 1;
2886 break;
2887 }
2888
2889 while (key.objectid >= logic_start + map->stripe_len)
2890 logic_start += map->stripe_len;
2891
2892 extent = btrfs_item_ptr(l, slot,
2893 struct btrfs_extent_item);
2894 flags = btrfs_extent_flags(l, extent);
2895 generation = btrfs_extent_generation(l, extent);
2896
2897 if (key.objectid < logic_start &&
2898 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2899 btrfs_err(fs_info,
2900 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2901 key.objectid, logic_start);
2902 goto next;
2903 }
2904again:
2905 extent_logical = key.objectid;
2906 extent_len = bytes;
2907
2908 if (extent_logical < logic_start) {
2909 extent_len -= logic_start - extent_logical;
2910 extent_logical = logic_start;
2911 }
2912
2913 if (extent_logical + extent_len >
2914 logic_start + map->stripe_len)
2915 extent_len = logic_start + map->stripe_len -
2916 extent_logical;
2917
2918 scrub_parity_mark_sectors_data(sparity, extent_logical,
2919 extent_len);
2920
2921 scrub_remap_extent(fs_info, extent_logical,
2922 extent_len, &extent_physical,
2923 &extent_dev,
2924 &extent_mirror_num);
2925
2926 ret = btrfs_lookup_csums_range(csum_root,
2927 extent_logical,
2928 extent_logical + extent_len - 1,
2929 &sctx->csum_list, 1);
2930 if (ret)
2931 goto out;
2932
2933 ret = scrub_extent_for_parity(sparity, extent_logical,
2934 extent_len,
2935 extent_physical,
2936 extent_dev, flags,
2937 generation,
2938 extent_mirror_num);
2939 if (ret)
2940 goto out;
2941
2942 scrub_free_csums(sctx);
2943 if (extent_logical + extent_len <
2944 key.objectid + bytes) {
2945 logic_start += map->stripe_len;
2946
2947 if (logic_start >= logic_end) {
2948 stop_loop = 1;
2949 break;
2950 }
2951
2952 if (logic_start < key.objectid + bytes) {
2953 cond_resched();
2954 goto again;
2955 }
2956 }
2957next:
2958 path->slots[0]++;
2959 }
2960
2961 btrfs_release_path(path);
2962
2963 if (stop_loop)
2964 break;
2965
2966 logic_start += map->stripe_len;
2967 }
2968out:
2969 if (ret < 0)
2970 scrub_parity_mark_sectors_error(sparity, logic_start,
2971 logic_end - logic_start + 1);
2972 scrub_parity_put(sparity);
2973 scrub_submit(sctx);
2974 mutex_lock(&sctx->wr_ctx.wr_lock);
2975 scrub_wr_submit(sctx);
2976 mutex_unlock(&sctx->wr_ctx.wr_lock);
2977
2978 btrfs_release_path(path);
2979 return ret < 0 ? ret : 0;
2980}
2981
2272static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 2982static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2273 struct map_lookup *map, 2983 struct map_lookup *map,
2274 struct btrfs_device *scrub_dev, 2984 struct btrfs_device *scrub_dev,
2275 int num, u64 base, u64 length, 2985 int num, u64 base, u64 length,
2276 int is_dev_replace) 2986 int is_dev_replace)
2277{ 2987{
2278 struct btrfs_path *path; 2988 struct btrfs_path *path, *ppath;
2279 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 2989 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2280 struct btrfs_root *root = fs_info->extent_root; 2990 struct btrfs_root *root = fs_info->extent_root;
2281 struct btrfs_root *csum_root = fs_info->csum_root; 2991 struct btrfs_root *csum_root = fs_info->csum_root;
@@ -2302,6 +3012,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2302 u64 extent_logical; 3012 u64 extent_logical;
2303 u64 extent_physical; 3013 u64 extent_physical;
2304 u64 extent_len; 3014 u64 extent_len;
3015 u64 stripe_logical;
3016 u64 stripe_end;
2305 struct btrfs_device *extent_dev; 3017 struct btrfs_device *extent_dev;
2306 int extent_mirror_num; 3018 int extent_mirror_num;
2307 int stop_loop = 0; 3019 int stop_loop = 0;
@@ -2327,7 +3039,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2327 mirror_num = num % map->num_stripes + 1; 3039 mirror_num = num % map->num_stripes + 1;
2328 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3040 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2329 BTRFS_BLOCK_GROUP_RAID6)) { 3041 BTRFS_BLOCK_GROUP_RAID6)) {
2330 get_raid56_logic_offset(physical, num, map, &offset); 3042 get_raid56_logic_offset(physical, num, map, &offset, NULL);
2331 increment = map->stripe_len * nr_data_stripes(map); 3043 increment = map->stripe_len * nr_data_stripes(map);
2332 mirror_num = 1; 3044 mirror_num = 1;
2333 } else { 3045 } else {
@@ -2339,6 +3051,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2339 if (!path) 3051 if (!path)
2340 return -ENOMEM; 3052 return -ENOMEM;
2341 3053
3054 ppath = btrfs_alloc_path();
3055 if (!ppath) {
3056 btrfs_free_path(path);
3057 return -ENOMEM;
3058 }
3059
2342 /* 3060 /*
2343 * work on commit root. The related disk blocks are static as 3061 * work on commit root. The related disk blocks are static as
2344 * long as COW is applied. This means it is safe to rewrite 3062 * long as COW is applied. This means it is safe to rewrite
@@ -2357,7 +3075,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2357 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3075 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2358 BTRFS_BLOCK_GROUP_RAID6)) { 3076 BTRFS_BLOCK_GROUP_RAID6)) {
2359 get_raid56_logic_offset(physical_end, num, 3077 get_raid56_logic_offset(physical_end, num,
2360 map, &logic_end); 3078 map, &logic_end, NULL);
2361 logic_end += base; 3079 logic_end += base;
2362 } else { 3080 } else {
2363 logic_end = logical + increment * nstripes; 3081 logic_end = logical + increment * nstripes;
@@ -2404,10 +3122,18 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2404 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3122 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2405 BTRFS_BLOCK_GROUP_RAID6)) { 3123 BTRFS_BLOCK_GROUP_RAID6)) {
2406 ret = get_raid56_logic_offset(physical, num, 3124 ret = get_raid56_logic_offset(physical, num,
2407 map, &logical); 3125 map, &logical, &stripe_logical);
2408 logical += base; 3126 logical += base;
2409 if (ret) 3127 if (ret) {
3128 stripe_logical += base;
3129 stripe_end = stripe_logical + increment - 1;
3130 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3131 ppath, stripe_logical,
3132 stripe_end);
3133 if (ret)
3134 goto out;
2410 goto skip; 3135 goto skip;
3136 }
2411 } 3137 }
2412 /* 3138 /*
2413 * canceled? 3139 * canceled?
@@ -2558,13 +3284,25 @@ again:
2558 * loop until we find next data stripe 3284 * loop until we find next data stripe
2559 * or we have finished all stripes. 3285 * or we have finished all stripes.
2560 */ 3286 */
2561 do { 3287loop:
2562 physical += map->stripe_len; 3288 physical += map->stripe_len;
2563 ret = get_raid56_logic_offset( 3289 ret = get_raid56_logic_offset(physical,
2564 physical, num, 3290 num, map, &logical,
2565 map, &logical); 3291 &stripe_logical);
2566 logical += base; 3292 logical += base;
2567 } while (physical < physical_end && ret); 3293
3294 if (ret && physical < physical_end) {
3295 stripe_logical += base;
3296 stripe_end = stripe_logical +
3297 increment - 1;
3298 ret = scrub_raid56_parity(sctx,
3299 map, scrub_dev, ppath,
3300 stripe_logical,
3301 stripe_end);
3302 if (ret)
3303 goto out;
3304 goto loop;
3305 }
2568 } else { 3306 } else {
2569 physical += map->stripe_len; 3307 physical += map->stripe_len;
2570 logical += increment; 3308 logical += increment;
@@ -2605,6 +3343,7 @@ out:
2605 3343
2606 blk_finish_plug(&plug); 3344 blk_finish_plug(&plug);
2607 btrfs_free_path(path); 3345 btrfs_free_path(path);
3346 btrfs_free_path(ppath);
2608 return ret < 0 ? ret : 0; 3347 return ret < 0 ? ret : 0;
2609} 3348}
2610 3349
@@ -3310,6 +4049,50 @@ out:
3310 scrub_pending_trans_workers_dec(sctx); 4049 scrub_pending_trans_workers_dec(sctx);
3311} 4050}
3312 4051
4052static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
4053 u64 logical)
4054{
4055 struct extent_state *cached_state = NULL;
4056 struct btrfs_ordered_extent *ordered;
4057 struct extent_io_tree *io_tree;
4058 struct extent_map *em;
4059 u64 lockstart = start, lockend = start + len - 1;
4060 int ret = 0;
4061
4062 io_tree = &BTRFS_I(inode)->io_tree;
4063
4064 lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
4065 ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
4066 if (ordered) {
4067 btrfs_put_ordered_extent(ordered);
4068 ret = 1;
4069 goto out_unlock;
4070 }
4071
4072 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4073 if (IS_ERR(em)) {
4074 ret = PTR_ERR(em);
4075 goto out_unlock;
4076 }
4077
4078 /*
4079 * This extent does not actually cover the logical extent anymore,
4080 * move on to the next inode.
4081 */
4082 if (em->block_start > logical ||
4083 em->block_start + em->block_len < logical + len) {
4084 free_extent_map(em);
4085 ret = 1;
4086 goto out_unlock;
4087 }
4088 free_extent_map(em);
4089
4090out_unlock:
4091 unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
4092 GFP_NOFS);
4093 return ret;
4094}
4095
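For reference, the return convention of check_extent_to_block() as consumed below in copy_nocow_pages_for_inode():

/*
 * < 0 - error (PTR_ERR from btrfs_get_extent());
 *   1 - skip this range: either an ordered extent is in flight or the
 *       extent no longer maps to the expected logical address;
 *   0 - the nocow copy may proceed.
 * The caller checks once before the copy loop and again per page,
 * because the extent range is not kept locked while pages are read and
 * written, so the mapping can change underneath.
 */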
3313static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, 4096static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3314 struct scrub_copy_nocow_ctx *nocow_ctx) 4097 struct scrub_copy_nocow_ctx *nocow_ctx)
3315{ 4098{
@@ -3318,13 +4101,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3318 struct inode *inode; 4101 struct inode *inode;
3319 struct page *page; 4102 struct page *page;
3320 struct btrfs_root *local_root; 4103 struct btrfs_root *local_root;
3321 struct btrfs_ordered_extent *ordered;
3322 struct extent_map *em;
3323 struct extent_state *cached_state = NULL;
3324 struct extent_io_tree *io_tree; 4104 struct extent_io_tree *io_tree;
3325 u64 physical_for_dev_replace; 4105 u64 physical_for_dev_replace;
4106 u64 nocow_ctx_logical;
3326 u64 len = nocow_ctx->len; 4107 u64 len = nocow_ctx->len;
3327 u64 lockstart = offset, lockend = offset + len - 1;
3328 unsigned long index; 4108 unsigned long index;
3329 int srcu_index; 4109 int srcu_index;
3330 int ret = 0; 4110 int ret = 0;
@@ -3356,30 +4136,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3356 4136
3357 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 4137 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3358 io_tree = &BTRFS_I(inode)->io_tree; 4138 io_tree = &BTRFS_I(inode)->io_tree;
4139 nocow_ctx_logical = nocow_ctx->logical;
3359 4140
3360 lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); 4141 ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
3361 ordered = btrfs_lookup_ordered_range(inode, lockstart, len); 4142 if (ret) {
3362 if (ordered) { 4143 ret = ret > 0 ? 0 : ret;
3363 btrfs_put_ordered_extent(ordered); 4144 goto out;
3364 goto out_unlock;
3365 }
3366
3367 em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
3368 if (IS_ERR(em)) {
3369 ret = PTR_ERR(em);
3370 goto out_unlock;
3371 }
3372
3373 /*
3374 * This extent does not actually cover the logical extent anymore,
3375 * move on to the next inode.
3376 */
3377 if (em->block_start > nocow_ctx->logical ||
3378 em->block_start + em->block_len < nocow_ctx->logical + len) {
3379 free_extent_map(em);
3380 goto out_unlock;
3381 } 4145 }
3382 free_extent_map(em);
3383 4146
3384 while (len >= PAGE_CACHE_SIZE) { 4147 while (len >= PAGE_CACHE_SIZE) {
3385 index = offset >> PAGE_CACHE_SHIFT; 4148 index = offset >> PAGE_CACHE_SHIFT;
@@ -3396,7 +4159,7 @@ again:
3396 goto next_page; 4159 goto next_page;
3397 } else { 4160 } else {
3398 ClearPageError(page); 4161 ClearPageError(page);
3399 err = extent_read_full_page_nolock(io_tree, page, 4162 err = extent_read_full_page(io_tree, page,
3400 btrfs_get_extent, 4163 btrfs_get_extent,
3401 nocow_ctx->mirror_num); 4164 nocow_ctx->mirror_num);
3402 if (err) { 4165 if (err) {
@@ -3421,6 +4184,14 @@ again:
3421 goto next_page; 4184 goto next_page;
3422 } 4185 }
3423 } 4186 }
4187
4188 ret = check_extent_to_block(inode, offset, len,
4189 nocow_ctx_logical);
4190 if (ret) {
4191 ret = ret > 0 ? 0 : ret;
4192 goto next_page;
4193 }
4194
3424 err = write_page_nocow(nocow_ctx->sctx, 4195 err = write_page_nocow(nocow_ctx->sctx,
3425 physical_for_dev_replace, page); 4196 physical_for_dev_replace, page);
3426 if (err) 4197 if (err)
@@ -3434,12 +4205,10 @@ next_page:
3434 4205
3435 offset += PAGE_CACHE_SIZE; 4206 offset += PAGE_CACHE_SIZE;
3436 physical_for_dev_replace += PAGE_CACHE_SIZE; 4207 physical_for_dev_replace += PAGE_CACHE_SIZE;
4208 nocow_ctx_logical += PAGE_CACHE_SIZE;
3437 len -= PAGE_CACHE_SIZE; 4209 len -= PAGE_CACHE_SIZE;
3438 } 4210 }
3439 ret = COPY_COMPLETE; 4211 ret = COPY_COMPLETE;
3440out_unlock:
3441 unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
3442 GFP_NOFS);
3443out: 4212out:
3444 mutex_unlock(&inode->i_mutex); 4213 mutex_unlock(&inode->i_mutex);
3445 iput(inode); 4214 iput(inode);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 874828dd0a86..804432dbc351 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5507,6 +5507,51 @@ out:
5507 return ret; 5507 return ret;
5508} 5508}
5509 5509
5510/*
5511 * If orphan cleanup did remove any orphans from a root, it means the tree
5512 * was modified and therefore the commit root is not the same as the current
5513 * root anymore. This is a problem, because send uses the commit root and
5514 * therefore can see inode items that don't exist in the current root anymore,
5515 * and for example make calls to btrfs_iget, which will do tree lookups based
5516 * on the current root and not on the commit root. Those lookups will fail,
5517 * returning a -ESTALE error, and making send fail with that error. So make
5518 * sure a send does not see any orphans we have just removed, and that it will
5519 * see the same inodes regardless of whether a transaction commit happened
5520 * before it started (meaning that the commit root will be the same as the
5521 * current root) or not.
5522 */
5523static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
5524{
5525 int i;
5526 struct btrfs_trans_handle *trans = NULL;
5527
5528again:
5529 if (sctx->parent_root &&
5530 sctx->parent_root->node != sctx->parent_root->commit_root)
5531 goto commit_trans;
5532
5533 for (i = 0; i < sctx->clone_roots_cnt; i++)
5534 if (sctx->clone_roots[i].root->node !=
5535 sctx->clone_roots[i].root->commit_root)
5536 goto commit_trans;
5537
5538 if (trans)
5539 return btrfs_end_transaction(trans, sctx->send_root);
5540
5541 return 0;
5542
5543commit_trans:
5544 /* Use any root, all fs roots will get their commit roots updated. */
5545 if (!trans) {
5546 trans = btrfs_join_transaction(sctx->send_root);
5547 if (IS_ERR(trans))
5548 return PTR_ERR(trans);
5549 goto again;
5550 }
5551
5552 return btrfs_commit_transaction(trans, sctx->send_root);
5553}
5554
5510static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) 5555static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
5511{ 5556{
5512 spin_lock(&root->root_item_lock); 5557 spin_lock(&root->root_item_lock);
@@ -5728,6 +5773,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5728 NULL); 5773 NULL);
5729 sort_clone_roots = 1; 5774 sort_clone_roots = 1;
5730 5775
5776 ret = ensure_commit_roots_uptodate(sctx);
5777 if (ret)
5778 goto out;
5779
5731 current->journal_info = BTRFS_SEND_TRANS_STUB; 5780 current->journal_info = BTRFS_SEND_TRANS_STUB;
5732 ret = send_subvol(sctx); 5781 ret = send_subvol(sctx);
5733 current->journal_info = NULL; 5782 current->journal_info = NULL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 54bd91ece35b..60f7cbe815e9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -262,7 +262,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
262 trans->aborted = errno; 262 trans->aborted = errno;
263 /* Nothing used. The other threads that have joined this 263 /* Nothing used. The other threads that have joined this
264 * transaction may be able to continue. */ 264 * transaction may be able to continue. */
265 if (!trans->blocks_used) { 265 if (!trans->blocks_used && list_empty(&trans->new_bgs)) {
266 const char *errstr; 266 const char *errstr;
267 267
268 errstr = btrfs_decode_error(errno); 268 errstr = btrfs_decode_error(errno);
@@ -642,11 +642,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
642 "disabling disk space caching"); 642 "disabling disk space caching");
643 break; 643 break;
644 case Opt_inode_cache: 644 case Opt_inode_cache:
645 btrfs_set_and_info(root, CHANGE_INODE_CACHE, 645 btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
646 "enabling inode map caching"); 646 "enabling inode map caching");
647 break; 647 break;
648 case Opt_noinode_cache: 648 case Opt_noinode_cache:
649 btrfs_clear_and_info(root, CHANGE_INODE_CACHE, 649 btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
650 "disabling inode map caching"); 650 "disabling inode map caching");
651 break; 651 break;
652 case Opt_clear_cache: 652 case Opt_clear_cache:
@@ -993,9 +993,17 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
993 trans = btrfs_attach_transaction_barrier(root); 993 trans = btrfs_attach_transaction_barrier(root);
994 if (IS_ERR(trans)) { 994 if (IS_ERR(trans)) {
995 /* no transaction, don't bother */ 995 /* no transaction, don't bother */
996 if (PTR_ERR(trans) == -ENOENT) 996 if (PTR_ERR(trans) == -ENOENT) {
997 return 0; 997 /*
998 return PTR_ERR(trans); 998 * Exit unless we have some pending changes
999 * that need to go through commit
1000 */
1001 if (fs_info->pending_changes == 0)
1002 return 0;
1003 trans = btrfs_start_transaction(root, 0);
1004 } else {
1005 return PTR_ERR(trans);
1006 }
999 } 1007 }
1000 return btrfs_commit_transaction(trans, root); 1008 return btrfs_commit_transaction(trans, root);
1001} 1009}
@@ -1644,8 +1652,20 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1644 int i = 0, nr_devices; 1652 int i = 0, nr_devices;
1645 int ret; 1653 int ret;
1646 1654
1655 /*
1656 * We aren't under the device list lock, so this is racey-ish, but good
1657 * enough for our purposes.
1658 */
1647 nr_devices = fs_info->fs_devices->open_devices; 1659 nr_devices = fs_info->fs_devices->open_devices;
1648 BUG_ON(!nr_devices); 1660 if (!nr_devices) {
1661 smp_mb();
1662 nr_devices = fs_info->fs_devices->open_devices;
1663 ASSERT(nr_devices);
1664 if (!nr_devices) {
1665 *free_bytes = 0;
1666 return 0;
1667 }
1668 }
1649 1669
1650 devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), 1670 devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
1651 GFP_NOFS); 1671 GFP_NOFS);
@@ -1670,11 +1690,17 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1670 else 1690 else
1671 min_stripe_size = BTRFS_STRIPE_LEN; 1691 min_stripe_size = BTRFS_STRIPE_LEN;
1672 1692
1673 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1693 if (fs_info->alloc_start)
1694 mutex_lock(&fs_devices->device_list_mutex);
1695 rcu_read_lock();
1696 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
1674 if (!device->in_fs_metadata || !device->bdev || 1697 if (!device->in_fs_metadata || !device->bdev ||
1675 device->is_tgtdev_for_dev_replace) 1698 device->is_tgtdev_for_dev_replace)
1676 continue; 1699 continue;
1677 1700
1701 if (i >= nr_devices)
1702 break;
1703
1678 avail_space = device->total_bytes - device->bytes_used; 1704 avail_space = device->total_bytes - device->bytes_used;
1679 1705
1680 /* align with stripe_len */ 1706 /* align with stripe_len */
@@ -1689,24 +1715,32 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1689 skip_space = 1024 * 1024; 1715 skip_space = 1024 * 1024;
1690 1716
1691 /* user can set the offset in fs_info->alloc_start. */ 1717 /* user can set the offset in fs_info->alloc_start. */
1692 if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= 1718 if (fs_info->alloc_start &&
1693 device->total_bytes) 1719 fs_info->alloc_start + BTRFS_STRIPE_LEN <=
1720 device->total_bytes) {
1721 rcu_read_unlock();
1694 skip_space = max(fs_info->alloc_start, skip_space); 1722 skip_space = max(fs_info->alloc_start, skip_space);
1695 1723
1696 /* 1724 /*
1697 * btrfs can not use the free space in [0, skip_space - 1], 1725 * btrfs can not use the free space in
1698 * we must subtract it from the total. In order to implement 1726 * [0, skip_space - 1], we must subtract it from the
1699 * it, we account the used space in this range first. 1727 * total. In order to implement it, we account the used
1700 */ 1728 * space in this range first.
1701 ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, 1729 */
1702 &used_space); 1730 ret = btrfs_account_dev_extents_size(device, 0,
1703 if (ret) { 1731 skip_space - 1,
1704 kfree(devices_info); 1732 &used_space);
1705 return ret; 1733 if (ret) {
1706 } 1734 kfree(devices_info);
1735 mutex_unlock(&fs_devices->device_list_mutex);
1736 return ret;
1737 }
1707 1738
1708 /* calc the free space in [0, skip_space - 1] */ 1739 rcu_read_lock();
1709 skip_space -= used_space; 1740
1741 /* calc the free space in [0, skip_space - 1] */
1742 skip_space -= used_space;
1743 }
1710 1744
1711 /* 1745 /*
1712 * we can use the free space in [0, skip_space - 1], subtract 1746 * we can use the free space in [0, skip_space - 1], subtract
@@ -1725,6 +1759,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1725 1759
1726 i++; 1760 i++;
1727 } 1761 }
1762 rcu_read_unlock();
1763 if (fs_info->alloc_start)
1764 mutex_unlock(&fs_devices->device_list_mutex);
1728 1765
1729 nr_devices = i; 1766 nr_devices = i;
1730 1767
@@ -1787,8 +1824,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1787 * holding chunk_mutex to avoid allocating new chunks, holding 1824 * holding chunk_mutex to avoid allocating new chunks, holding
1788 * device_list_mutex to avoid the device being removed 1825 * device_list_mutex to avoid the device being removed
1789 */ 1826 */
1790 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1791 mutex_lock(&fs_info->chunk_mutex);
1792 rcu_read_lock(); 1827 rcu_read_lock();
1793 list_for_each_entry_rcu(found, head, list) { 1828 list_for_each_entry_rcu(found, head, list) {
1794 if (found->flags & BTRFS_BLOCK_GROUP_DATA) { 1829 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1824,17 +1859,12 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1824 buf->f_bfree -= block_rsv->size >> bits; 1859 buf->f_bfree -= block_rsv->size >> bits;
1825 spin_unlock(&block_rsv->lock); 1860 spin_unlock(&block_rsv->lock);
1826 1861
1827 buf->f_bavail = total_free_data; 1862 buf->f_bavail = div_u64(total_free_data, factor);
1828 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); 1863 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1829 if (ret) { 1864 if (ret)
1830 mutex_unlock(&fs_info->chunk_mutex);
1831 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1832 return ret; 1865 return ret;
1833 }
1834 buf->f_bavail += div_u64(total_free_data, factor); 1866 buf->f_bavail += div_u64(total_free_data, factor);
1835 buf->f_bavail = buf->f_bavail >> bits; 1867 buf->f_bavail = buf->f_bavail >> bits;
1836 mutex_unlock(&fs_info->chunk_mutex);
1837 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1838 1868
1839 buf->f_type = BTRFS_SUPER_MAGIC; 1869 buf->f_type = BTRFS_SUPER_MAGIC;
1840 buf->f_bsize = dentry->d_sb->s_blocksize; 1870 buf->f_bsize = dentry->d_sb->s_blocksize;
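The f_bavail assignment above now divides by factor: total_free_data counts raw bytes inside data block groups, while factor (computed earlier in btrfs_statfs(), outside this hunk) reflects the duplication of the data profile, e.g. 2 for DUP/RAID1/RAID10. A quick illustration:

/*
 * 100 GiB of raw free space in a RAID1 data profile:
 *   factor = 2 -> f_bavail reports ~50 GiB, since every logical byte
 *   written consumes two raw bytes.
 * The old code assigned the raw figure directly and so over-reported
 * available space on duplicated profiles.
 */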
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index b2e7bb4393f6..92db3f648df4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -111,7 +111,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
111{ 111{
112 struct btrfs_fs_info *fs_info; 112 struct btrfs_fs_info *fs_info;
113 struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); 113 struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
114 struct btrfs_trans_handle *trans;
115 u64 features, set, clear; 114 u64 features, set, clear;
116 unsigned long val; 115 unsigned long val;
117 int ret; 116 int ret;
@@ -153,10 +152,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
153 btrfs_info(fs_info, "%s %s feature flag", 152 btrfs_info(fs_info, "%s %s feature flag",
154 val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); 153 val ? "Setting" : "Clearing", fa->kobj_attr.attr.name);
155 154
156 trans = btrfs_start_transaction(fs_info->fs_root, 0);
157 if (IS_ERR(trans))
158 return PTR_ERR(trans);
159
160 spin_lock(&fs_info->super_lock); 155 spin_lock(&fs_info->super_lock);
161 features = get_features(fs_info, fa->feature_set); 156 features = get_features(fs_info, fa->feature_set);
162 if (val) 157 if (val)
@@ -166,9 +161,11 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
166 set_features(fs_info, fa->feature_set, features); 161 set_features(fs_info, fa->feature_set, features);
167 spin_unlock(&fs_info->super_lock); 162 spin_unlock(&fs_info->super_lock);
168 163
169 ret = btrfs_commit_transaction(trans, fs_info->fs_root); 164 /*
170 if (ret) 165 * We don't want to do full transaction commit from inside sysfs
171 return ret; 166 */
167 btrfs_set_pending(fs_info, COMMIT);
168 wake_up_process(fs_info->transaction_kthread);
172 169
173 return count; 170 return count;
174} 171}
@@ -372,9 +369,6 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
372 const char *buf, size_t len) 369 const char *buf, size_t len)
373{ 370{
374 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 371 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
375 struct btrfs_trans_handle *trans;
376 struct btrfs_root *root = fs_info->fs_root;
377 int ret;
378 size_t p_len; 372 size_t p_len;
379 373
380 if (fs_info->sb->s_flags & MS_RDONLY) 374 if (fs_info->sb->s_flags & MS_RDONLY)
@@ -389,20 +383,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
389 if (p_len >= BTRFS_LABEL_SIZE) 383 if (p_len >= BTRFS_LABEL_SIZE)
390 return -EINVAL; 384 return -EINVAL;
391 385
392 trans = btrfs_start_transaction(root, 0); 386 spin_lock(&fs_info->super_lock);
393 if (IS_ERR(trans))
394 return PTR_ERR(trans);
395
396 spin_lock(&root->fs_info->super_lock);
397 memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); 387 memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
398 memcpy(fs_info->super_copy->label, buf, p_len); 388 memcpy(fs_info->super_copy->label, buf, p_len);
399 spin_unlock(&root->fs_info->super_lock); 389 spin_unlock(&fs_info->super_lock);
400 ret = btrfs_commit_transaction(trans, root);
401 390
402 if (!ret) 391 /*
403 return len; 392 * We don't want to do full transaction commit from inside sysfs
393 */
394 btrfs_set_pending(fs_info, COMMIT);
395 wake_up_process(fs_info->transaction_kthread);
404 396
405 return ret; 397 return len;
406} 398}
407BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); 399BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
408 400
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dcaae3616728..a605d4e2f2bc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
76 } 76 }
77} 77}
78 78
79static void clear_btree_io_tree(struct extent_io_tree *tree)
80{
81 spin_lock(&tree->lock);
82 while (!RB_EMPTY_ROOT(&tree->state)) {
83 struct rb_node *node;
84 struct extent_state *state;
85
86 node = rb_first(&tree->state);
87 state = rb_entry(node, struct extent_state, rb_node);
88 rb_erase(&state->rb_node, &tree->state);
89 RB_CLEAR_NODE(&state->rb_node);
90 /*
91 * btree io trees aren't supposed to have tasks waiting for
92 * changes in the flags of extent states ever.
93 */
94 ASSERT(!waitqueue_active(&state->wq));
95 free_extent_state(state);
96 if (need_resched()) {
97 spin_unlock(&tree->lock);
98 cond_resched();
99 spin_lock(&tree->lock);
100 }
101 }
102 spin_unlock(&tree->lock);
103}
104
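Two details of clear_btree_io_tree() above are worth noting:

/*
 * - rb_erase() + RB_CLEAR_NODE() fully detaches each extent_state
 *   before free_extent_state() drops the reference, so a stale node
 *   can never be reached through the tree;
 * - the spinlock is cycled through cond_resched() so draining a large
 *   tree cannot hog the CPU. This is only safe because the callers
 *   added in this patch run at commit time, when nothing else mutates
 *   these io trees.
 */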
79static noinline void switch_commit_roots(struct btrfs_transaction *trans, 105static noinline void switch_commit_roots(struct btrfs_transaction *trans,
80 struct btrfs_fs_info *fs_info) 106 struct btrfs_fs_info *fs_info)
81{ 107{
@@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
89 root->commit_root = btrfs_root_node(root); 115 root->commit_root = btrfs_root_node(root);
90 if (is_fstree(root->objectid)) 116 if (is_fstree(root->objectid))
91 btrfs_unpin_free_ino(root); 117 btrfs_unpin_free_ino(root);
118 clear_btree_io_tree(&root->dirty_log_pages);
92 } 119 }
93 up_write(&fs_info->commit_root_sem); 120 up_write(&fs_info->commit_root_sem);
94} 121}
@@ -220,6 +247,7 @@ loop:
220 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 247 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
221 INIT_LIST_HEAD(&cur_trans->pending_chunks); 248 INIT_LIST_HEAD(&cur_trans->pending_chunks);
222 INIT_LIST_HEAD(&cur_trans->switch_commits); 249 INIT_LIST_HEAD(&cur_trans->switch_commits);
250 INIT_LIST_HEAD(&cur_trans->pending_ordered);
223 list_add_tail(&cur_trans->list, &fs_info->trans_list); 251 list_add_tail(&cur_trans->list, &fs_info->trans_list);
224 extent_io_tree_init(&cur_trans->dirty_pages, 252 extent_io_tree_init(&cur_trans->dirty_pages,
225 fs_info->btree_inode->i_mapping); 253 fs_info->btree_inode->i_mapping);
@@ -488,6 +516,7 @@ again:
488 h->sync = false; 516 h->sync = false;
489 INIT_LIST_HEAD(&h->qgroup_ref_list); 517 INIT_LIST_HEAD(&h->qgroup_ref_list);
490 INIT_LIST_HEAD(&h->new_bgs); 518 INIT_LIST_HEAD(&h->new_bgs);
519 INIT_LIST_HEAD(&h->ordered);
491 520
492 smp_mb(); 521 smp_mb();
493 if (cur_trans->state >= TRANS_STATE_BLOCKED && 522 if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -719,6 +748,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
719 if (!list_empty(&trans->new_bgs)) 748 if (!list_empty(&trans->new_bgs))
720 btrfs_create_pending_block_groups(trans, root); 749 btrfs_create_pending_block_groups(trans, root);
721 750
751 if (!list_empty(&trans->ordered)) {
752 spin_lock(&info->trans_lock);
753 list_splice(&trans->ordered, &cur_trans->pending_ordered);
754 spin_unlock(&info->trans_lock);
755 }
756
722 trans->delayed_ref_updates = 0; 757 trans->delayed_ref_updates = 0;
723 if (!trans->sync) { 758 if (!trans->sync) {
724 must_run_delayed_refs = 759 must_run_delayed_refs =
@@ -828,17 +863,39 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
828 863
829 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 864 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
830 mark, &cached_state)) { 865 mark, &cached_state)) {
831 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 866 bool wait_writeback = false;
832 mark, &cached_state, GFP_NOFS); 867
833 cached_state = NULL; 868 err = convert_extent_bit(dirty_pages, start, end,
834 err = filemap_fdatawrite_range(mapping, start, end); 869 EXTENT_NEED_WAIT,
870 mark, &cached_state, GFP_NOFS);
871 /*
872 * convert_extent_bit can return -ENOMEM, which is most of the
873 * time a temporary error. So when it happens, ignore the error
874 * and wait for writeback of this range to finish - because we
875 * failed to set the bit EXTENT_NEED_WAIT for the range, a call
876 * to btrfs_wait_marked_extents() would not know that writeback
877 * for this range started and therefore wouldn't wait for it to
878 * finish - we don't want to commit a superblock that points to
879 * btree nodes/leafs for which writeback hasn't finished yet
880 * (and without errors).
881 * We cleanup any entries left in the io tree when committing
882 * the transaction (through clear_btree_io_tree()).
883 */
884 if (err == -ENOMEM) {
885 err = 0;
886 wait_writeback = true;
887 }
888 if (!err)
889 err = filemap_fdatawrite_range(mapping, start, end);
835 if (err) 890 if (err)
836 werr = err; 891 werr = err;
892 else if (wait_writeback)
893 werr = filemap_fdatawait_range(mapping, start, end);
894 free_extent_state(cached_state);
895 cached_state = NULL;
837 cond_resched(); 896 cond_resched();
838 start = end + 1; 897 start = end + 1;
839 } 898 }
840 if (err)
841 werr = err;
842 return werr; 899 return werr;
843} 900}
844 901
@@ -862,11 +919,25 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
862 919
863 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 920 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
864 EXTENT_NEED_WAIT, &cached_state)) { 921 EXTENT_NEED_WAIT, &cached_state)) {
865 clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 922 /*
866 0, 0, &cached_state, GFP_NOFS); 923 * Ignore -ENOMEM errors returned by clear_extent_bit().
867 err = filemap_fdatawait_range(mapping, start, end); 924 * When committing the transaction, we'll remove any entries
925 * left in the io tree. For a log commit, we don't remove them
926 * after committing the log because the tree can be accessed
927 * concurrently - we do it only at transaction commit time when
928 * it's safe to do it (through clear_btree_io_tree()).
929 */
930 err = clear_extent_bit(dirty_pages, start, end,
931 EXTENT_NEED_WAIT,
932 0, 0, &cached_state, GFP_NOFS);
933 if (err == -ENOMEM)
934 err = 0;
935 if (!err)
936 err = filemap_fdatawait_range(mapping, start, end);
868 if (err) 937 if (err)
869 werr = err; 938 werr = err;
939 free_extent_state(cached_state);
940 cached_state = NULL;
870 cond_resched(); 941 cond_resched();
871 start = end + 1; 942 start = end + 1;
872 } 943 }
@@ -919,17 +990,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
919 return 0; 990 return 0;
920} 991}
921 992
922int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 993static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
923 struct btrfs_root *root) 994 struct btrfs_root *root)
924{ 995{
925 if (!trans || !trans->transaction) { 996 int ret;
926 struct inode *btree_inode; 997
927 btree_inode = root->fs_info->btree_inode; 998 ret = btrfs_write_and_wait_marked_extents(root,
928 return filemap_write_and_wait(btree_inode->i_mapping);
929 }
930 return btrfs_write_and_wait_marked_extents(root,
931 &trans->transaction->dirty_pages, 999 &trans->transaction->dirty_pages,
932 EXTENT_DIRTY); 1000 EXTENT_DIRTY);
1001 clear_btree_io_tree(&trans->transaction->dirty_pages);
1002
1003 return ret;
933} 1004}
934 1005
935/* 1006/*
@@ -1652,6 +1723,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
1652 btrfs_wait_ordered_roots(fs_info, -1); 1723 btrfs_wait_ordered_roots(fs_info, -1);
1653} 1724}
1654 1725
1726static inline void
1727btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
1728 struct btrfs_fs_info *fs_info)
1729{
1730 struct btrfs_ordered_extent *ordered;
1731
1732 spin_lock(&fs_info->trans_lock);
1733 while (!list_empty(&cur_trans->pending_ordered)) {
1734 ordered = list_first_entry(&cur_trans->pending_ordered,
1735 struct btrfs_ordered_extent,
1736 trans_list);
1737 list_del_init(&ordered->trans_list);
1738 spin_unlock(&fs_info->trans_lock);
1739
1740 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
1741 &ordered->flags));
1742 btrfs_put_ordered_extent(ordered);
1743 spin_lock(&fs_info->trans_lock);
1744 }
1745 spin_unlock(&fs_info->trans_lock);
1746}
1747
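The loop above follows the usual shape for draining a list you must sleep on:

/*
 * trans_lock is a spinlock and wait_event() sleeps, so the lock must
 * be dropped before waiting and re-taken to pick the next entry.
 * list_del_init() happens while the lock is still held, so no other
 * thread can see the entry half-processed, and the ordered extent's
 * reference is only put after BTRFS_ORDERED_COMPLETE is observed.
 */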
1655int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1748int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1656 struct btrfs_root *root) 1749 struct btrfs_root *root)
1657{ 1750{
@@ -1702,6 +1795,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1702 } 1795 }
1703 1796
1704 spin_lock(&root->fs_info->trans_lock); 1797 spin_lock(&root->fs_info->trans_lock);
1798 list_splice(&trans->ordered, &cur_trans->pending_ordered);
1705 if (cur_trans->state >= TRANS_STATE_COMMIT_START) { 1799 if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
1706 spin_unlock(&root->fs_info->trans_lock); 1800 spin_unlock(&root->fs_info->trans_lock);
1707 atomic_inc(&cur_trans->use_count); 1801 atomic_inc(&cur_trans->use_count);
@@ -1754,6 +1848,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1754 1848
1755 btrfs_wait_delalloc_flush(root->fs_info); 1849 btrfs_wait_delalloc_flush(root->fs_info);
1756 1850
1851 btrfs_wait_pending_ordered(cur_trans, root->fs_info);
1852
1757 btrfs_scrub_pause(root); 1853 btrfs_scrub_pause(root);
1758 /* 1854 /*
1759 * Ok now we need to make sure to block out any other joins while we 1855 * Ok now we need to make sure to block out any other joins while we
@@ -1842,13 +1938,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1842 } 1938 }
1843 1939
1844 /* 1940 /*
1845 * Since the transaction is done, we should set the inode map cache flag 1941 * Since the transaction is done, we can apply the pending changes
1846 * before any other coming transaction. 1942 * before the next transaction.
1847 */ 1943 */
1848 if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) 1944 btrfs_apply_pending_changes(root->fs_info);
1849 btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
1850 else
1851 btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
1852 1945
1853 /* commit_fs_roots gets rid of all the tree log roots, it is now 1946 /* commit_fs_roots gets rid of all the tree log roots, it is now
1854 * safe to free the root of tree log roots 1947 * safe to free the root of tree log roots
@@ -2019,3 +2112,32 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
2019 2112
2020 return (ret < 0) ? 0 : 1; 2113 return (ret < 0) ? 0 : 1;
2021} 2114}
2115
2116void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
2117{
2118 unsigned long prev;
2119 unsigned long bit;
2120
2121 prev = xchg(&fs_info->pending_changes, 0);
2122 if (!prev)
2123 return;
2124
2125 bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
2126 if (prev & bit)
2127 btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
2128 prev &= ~bit;
2129
2130 bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
2131 if (prev & bit)
2132 btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
2133 prev &= ~bit;
2134
2135 bit = 1 << BTRFS_PENDING_COMMIT;
2136 if (prev & bit)
2137 btrfs_debug(fs_info, "pending commit done");
2138 prev &= ~bit;
2139
2140 if (prev)
2141 btrfs_warn(fs_info,
2142 "unknown pending changes left 0x%lx, ignoring", prev);
2143}
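The pending-changes machinery consumed by btrfs_apply_pending_changes() is a simple atomic producer/consumer bitmask; a sketch of both sides (the producer helper btrfs_set_pending() lives in ctree.h, outside this diff, so its set_bit form here is an assumption):

/* producer, e.g. the sysfs label/feature stores in this series: */
set_bit(BTRFS_PENDING_COMMIT, &fs_info->pending_changes);
wake_up_process(fs_info->transaction_kthread);

/* consumer, at transaction commit: */
prev = xchg(&fs_info->pending_changes, 0);	/* atomic fetch-and-clear */
/* ... act on each bit in prev ... */

/*
 * xchg() makes the fetch-and-clear atomic: a bit set after the exchange
 * is preserved for the next transaction instead of being lost, which is
 * why a plain read followed by a store would not be enough here.
 */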
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d8f40e1a5d2d..00ed29c4b3f9 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -56,6 +56,7 @@ struct btrfs_transaction {
56 wait_queue_head_t commit_wait; 56 wait_queue_head_t commit_wait;
57 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
58 struct list_head pending_chunks; 58 struct list_head pending_chunks;
59 struct list_head pending_ordered;
59 struct list_head switch_commits; 60 struct list_head switch_commits;
60 struct btrfs_delayed_ref_root delayed_refs; 61 struct btrfs_delayed_ref_root delayed_refs;
61 int aborted; 62 int aborted;
@@ -105,6 +106,7 @@ struct btrfs_trans_handle {
105 */ 106 */
106 struct btrfs_root *root; 107 struct btrfs_root *root;
107 struct seq_list delayed_ref_elem; 108 struct seq_list delayed_ref_elem;
109 struct list_head ordered;
108 struct list_head qgroup_ref_list; 110 struct list_head qgroup_ref_list;
109 struct list_head new_bgs; 111 struct list_head new_bgs;
110}; 112};
@@ -145,8 +147,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
145 struct btrfs_root *root); 147 struct btrfs_root *root);
146struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); 148struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
147int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); 149int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
148int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
149 struct btrfs_root *root);
150 150
151void btrfs_add_dead_root(struct btrfs_root *root); 151void btrfs_add_dead_root(struct btrfs_root *root);
152int btrfs_defrag_root(struct btrfs_root *root); 152int btrfs_defrag_root(struct btrfs_root *root);
@@ -170,4 +170,6 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
170int btrfs_transaction_blocked(struct btrfs_fs_info *info); 170int btrfs_transaction_blocked(struct btrfs_fs_info *info);
171int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 171int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
172void btrfs_put_transaction(struct btrfs_transaction *transaction); 172void btrfs_put_transaction(struct btrfs_transaction *transaction);
173void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
174
173#endif 175#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 286213cec861..9a02da16f2be 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2599,12 +2599,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	index2 = root_log_ctx.log_transid % 2;
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
 		blk_finish_plug(&plug);
-		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+		ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
+						mark);
+		btrfs_wait_logged_extents(trans, log, log_transid);
 		wait_log_commit(trans, log_root_tree,
 				root_log_ctx.log_transid);
-		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
-		ret = root_log_ctx.log_ret;
+		if (!ret)
+			ret = root_log_ctx.log_ret;
 		goto out;
 	}
 	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
@@ -2641,11 +2643,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		mutex_unlock(&log_root_tree->log_mutex);
 		goto out_wake_log_root;
 	}
-	btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
-	btrfs_wait_marked_extents(log_root_tree,
-				  &log_root_tree->dirty_log_pages,
-				  EXTENT_NEW | EXTENT_DIRTY);
-	btrfs_wait_logged_extents(log, log_transid);
+	ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+	if (!ret)
+		ret = btrfs_wait_marked_extents(log_root_tree,
+						&log_root_tree->dirty_log_pages,
+						EXTENT_NEW | EXTENT_DIRTY);
+	if (ret) {
+		btrfs_set_log_full_commit(root->fs_info, trans);
+		btrfs_free_logged_extents(log, log_transid);
+		mutex_unlock(&log_root_tree->log_mutex);
+		goto out_wake_log_root;
+	}
+	btrfs_wait_logged_extents(trans, log, log_transid);
 
 	btrfs_set_super_log_root(root->fs_info->super_for_commit,
 				 log_root_tree->node->start);
@@ -3626,6 +3635,12 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
 			 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
 
 		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
+			/*
+			 * Clear the AS_EIO/AS_ENOSPC flags from the inode's
+			 * i_mapping flags, so that the next fsync won't get
+			 * an outdated io error too.
+			 */
+			btrfs_inode_check_errors(inode);
 			*ordered_io_error = true;
 			break;
 		}
@@ -3766,7 +3781,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	fi = btrfs_item_ptr(leaf, path->slots[0],
 			    struct btrfs_file_extent_item);
 
-	btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+	btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
 					       &token);
 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
 		btrfs_set_token_file_extent_type(leaf, fi,
@@ -3963,7 +3978,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&BTRFS_I(inode)->log_mutex);
 
-	btrfs_get_logged_extents(inode, &logged_list);
+	btrfs_get_logged_extents(inode, &logged_list, start, end);
 
 	/*
 	 * a brute force approach to making sure we get the most uptodate
@@ -4089,6 +4104,21 @@ log_extents:
 	btrfs_release_path(path);
 	btrfs_release_path(dst_path);
 	if (fast_search) {
+		/*
+		 * Some ordered extents started by fsync might have completed
+		 * before we collected the ordered extents in logged_list, which
+		 * means they're gone, not in our logged_list nor in the inode's
+		 * ordered tree. We want the application/user space to know an
+		 * error happened while attempting to persist file data so that
+		 * it can take proper action. If such error happened, we leave
+		 * without writing to the log tree and the fsync must report the
+		 * file data write error and not commit the current transaction.
+		 */
+		err = btrfs_inode_check_errors(inode);
+		if (err) {
+			ctx->io_err = err;
+			goto out_unlock;
+		}
 		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
 						&logged_list, ctx);
 		if (ret) {
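
The btrfs_inode_check_errors() calls added above latch writeback errors (AS_EIO/AS_ENOSPC) out of the mapping so that fsync reports them instead of silently dropping them, and so that a later fsync does not see a stale error. From user space the visible contract is simply that a failed fsync() means the data may not be durable, and that the error is consumed when reported. A minimal usage sketch (file name is illustrative):

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	/*
	 * A failed fsync() means "data may not be durable"; the kernel clears
	 * its latched mapping error once it has been reported, so do not just
	 * retry fsync() and trust a later success for the same data.
	 */
	static int write_durably(int fd, const void *buf, size_t len)
	{
		if (write(fd, buf, len) != (ssize_t)len)
			return -1;
		if (fsync(fd) != 0) {
			fprintf(stderr, "fsync: %s (treat data as lost)\n",
				strerror(errno));
			return -1;
		}
		return 0;
	}

	int main(void)
	{
		int fd = open("testfile", O_CREAT | O_WRONLY | O_TRUNC, 0644);

		if (fd < 0 || write_durably(fd, "hello\n", 6) != 0)
			return 1;
		return close(fd);
	}
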
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d47289c715c8..0144790e296e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -53,16 +53,6 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
 
-static void lock_chunks(struct btrfs_root *root)
-{
-	mutex_lock(&root->fs_info->chunk_mutex);
-}
-
-static void unlock_chunks(struct btrfs_root *root)
-{
-	mutex_unlock(&root->fs_info->chunk_mutex);
-}
-
 static struct btrfs_fs_devices *__alloc_fs_devices(void)
 {
 	struct btrfs_fs_devices *fs_devs;
@@ -1068,9 +1058,11 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
 				   u64 *start, u64 len)
 {
 	struct extent_map *em;
+	struct list_head *search_list = &trans->transaction->pending_chunks;
 	int ret = 0;
 
-	list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
+again:
+	list_for_each_entry(em, search_list, list) {
 		struct map_lookup *map;
 		int i;
 
@@ -1087,6 +1079,10 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
 			ret = 1;
 		}
 	}
+	if (search_list == &trans->transaction->pending_chunks) {
+		search_list = &trans->root->fs_info->pinned_chunks;
+		goto again;
+	}
 
 	return ret;
 }
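
contains_pending_extent() now checks two lists, the transaction's pending chunks and then the fs-wide pinned chunks, by retargeting one list pointer and looping again rather than duplicating the loop body. A generic, self-contained model of that goto-driven two-pass scan (node type and overlap test are illustrative):

	#include <stdbool.h>
	#include <stdio.h>

	struct node { unsigned long long start, len; struct node *next; };

	static bool overlaps_any(struct node *pending, struct node *pinned,
				 unsigned long long start, unsigned long long len)
	{
		struct node *list = pending;
		bool ret = false;

	again:
		for (struct node *n = list; n; n = n->next)
			if (n->start < start + len && start < n->start + n->len)
				ret = true;
		if (list == pending) {
			list = pinned;	/* second pass over the other list */
			goto again;
		}
		return ret;
	}

	int main(void)
	{
		struct node pinned = { 100, 50, NULL };
		struct node pending = { 0, 10, NULL };

		printf("%d\n", overlaps_any(&pending, &pinned, 120, 8)); /* 1 */
		return 0;
	}
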
@@ -1800,8 +1796,8 @@ error_undo:
 	goto error_brelse;
 }
 
-void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
-				 struct btrfs_device *srcdev)
+void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
+					struct btrfs_device *srcdev)
 {
 	struct btrfs_fs_devices *fs_devices;
 
@@ -1829,6 +1825,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
 
 	if (srcdev->bdev)
 		fs_devices->open_devices--;
+}
+
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
+				      struct btrfs_device *srcdev)
+{
+	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
 
 	call_rcu(&srcdev->rcu, free_device);
 
@@ -2647,18 +2649,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
+	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
 	if (ret) {
 		btrfs_abort_transaction(trans, extent_root, ret);
 		goto out;
 	}
 
-	write_lock(&em_tree->lock);
-	remove_extent_mapping(em_tree, em);
-	write_unlock(&em_tree->lock);
-
-	/* once for the tree */
-	free_extent_map(em);
 out:
 	/* once for us */
 	free_extent_map(em);
@@ -4505,6 +4501,8 @@ error_del_extent:
 	free_extent_map(em);
 	/* One for the tree reference */
 	free_extent_map(em);
+	/* One for the pending_chunks list reference */
+	free_extent_map(em);
 error:
 	kfree(devices_info);
 	return ret;
@@ -4881,13 +4879,15 @@ static inline int parity_smaller(u64 a, u64 b)
 static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
 {
 	struct btrfs_bio_stripe s;
+	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
 	int i;
 	u64 l;
 	int again = 1;
+	int m;
 
 	while (again) {
 		again = 0;
-		for (i = 0; i < bbio->num_stripes - 1; i++) {
+		for (i = 0; i < real_stripes - 1; i++) {
 			if (parity_smaller(raid_map[i], raid_map[i+1])) {
 				s = bbio->stripes[i];
 				l = raid_map[i];
@@ -4895,6 +4895,14 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
 				raid_map[i] = raid_map[i+1];
 				bbio->stripes[i+1] = s;
 				raid_map[i+1] = l;
+
+				if (bbio->tgtdev_map) {
+					m = bbio->tgtdev_map[i];
+					bbio->tgtdev_map[i] =
+						bbio->tgtdev_map[i + 1];
+					bbio->tgtdev_map[i + 1] = m;
+				}
+
 				again = 1;
 			}
 		}
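
sort_parity_stripes() bubble-sorts the stripes by raid_map and must now keep a third parallel array, tgtdev_map, in step; otherwise writes meant for a replace target would be routed through the wrong stripe slot after sorting. A self-contained model of that keep-parallel-arrays-in-sync invariant (simplified, illustrative types):

	#include <stdio.h>

	struct stripe_s { unsigned long long physical; };

	static int parity_smaller(unsigned long long a, unsigned long long b)
	{
		return a > b;
	}

	static void sort_stripes(struct stripe_s *stripes,
				 unsigned long long *raid_map,
				 int *tgtdev_map, int real_stripes)
	{
		int again = 1;

		while (again) {
			again = 0;
			for (int i = 0; i < real_stripes - 1; i++) {
				if (!parity_smaller(raid_map[i], raid_map[i + 1]))
					continue;
				struct stripe_s s = stripes[i];
				unsigned long long l = raid_map[i];

				stripes[i] = stripes[i + 1];
				raid_map[i] = raid_map[i + 1];
				stripes[i + 1] = s;
				raid_map[i + 1] = l;
				if (tgtdev_map) {	/* keep the map in step */
					int m = tgtdev_map[i];

					tgtdev_map[i] = tgtdev_map[i + 1];
					tgtdev_map[i + 1] = m;
				}
				again = 1;
			}
		}
	}

	int main(void)
	{
		struct stripe_s st[3] = { { 30 }, { 10 }, { 20 } };
		unsigned long long rm[3] = { 3, 1, 2 };
		int tm[3] = { 2, 0, 1 };

		sort_stripes(st, rm, tm, 3);
		for (int i = 0; i < 3; i++)
			printf("%llu %llu %d\n", rm[i], st[i].physical, tm[i]);
		return 0;
	}
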
@@ -4923,6 +4931,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	int ret = 0;
 	int num_stripes;
 	int max_errors = 0;
+	int tgtdev_indexes = 0;
 	struct btrfs_bio *bbio = NULL;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 	int dev_replace_is_ongoing = 0;
@@ -5161,15 +5170,14 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			 BTRFS_BLOCK_GROUP_RAID6)) {
 		u64 tmp;
 
-		if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
-		    && raid_map_ret) {
+		if (raid_map_ret &&
+		    ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+		     mirror_num > 1)) {
 			int i, rot;
 
 			/* push stripe_nr back to the start of the full stripe */
 			stripe_nr = raid56_full_stripe_start;
-			do_div(stripe_nr, stripe_len);
-
-			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+			do_div(stripe_nr, stripe_len * nr_data_stripes(map));
 
 			/* RAID[56] write or recovery. Return all stripes */
 			num_stripes = map->num_stripes;
@@ -5235,14 +5243,19 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			num_alloc_stripes <<= 1;
 		if (rw & REQ_GET_READ_MIRRORS)
 			num_alloc_stripes++;
+		tgtdev_indexes = num_stripes;
 	}
-	bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
+
+	bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes),
+		       GFP_NOFS);
 	if (!bbio) {
 		kfree(raid_map);
 		ret = -ENOMEM;
 		goto out;
 	}
 	atomic_set(&bbio->error, 0);
+	if (dev_replace_is_ongoing)
+		bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
 
 	if (rw & REQ_DISCARD) {
 		int factor = 0;
@@ -5327,6 +5340,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
 		max_errors = btrfs_chunk_max_errors(map);
 
+	tgtdev_indexes = 0;
 	if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
 	    dev_replace->tgtdev != NULL) {
 		int index_where_to_add;
@@ -5355,8 +5369,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 				new->physical = old->physical;
 				new->length = old->length;
 				new->dev = dev_replace->tgtdev;
+				bbio->tgtdev_map[i] = index_where_to_add;
 				index_where_to_add++;
 				max_errors++;
+				tgtdev_indexes++;
 			}
 		}
 		num_stripes = index_where_to_add;
@@ -5402,7 +5418,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			tgtdev_stripe->length =
 				bbio->stripes[index_srcdev].length;
 			tgtdev_stripe->dev = dev_replace->tgtdev;
+			bbio->tgtdev_map[index_srcdev] = num_stripes;
 
+			tgtdev_indexes++;
 			num_stripes++;
 		}
 	}
@@ -5412,6 +5430,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	bbio->num_stripes = num_stripes;
 	bbio->max_errors = max_errors;
 	bbio->mirror_num = mirror_num;
+	bbio->num_tgtdevs = tgtdev_indexes;
 
 	/*
 	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -5443,6 +5462,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 				 mirror_num, NULL);
 }
 
+/* For Scrub/replace */
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+		     u64 logical, u64 *length,
+		     struct btrfs_bio **bbio_ret, int mirror_num,
+		     u64 **raid_map_ret)
+{
+	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
+				 mirror_num, raid_map_ret);
+}
+
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 		     u64 chunk_start, u64 physical, u64 devid,
 		     u64 **logical, int *naddrs, int *stripe_len)
@@ -5812,12 +5841,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		} else {
 			ret = raid56_parity_recover(root, bio, bbio,
 						    raid_map, map_length,
-						    mirror_num);
+						    mirror_num, 1);
 		}
-		/*
-		 * FIXME, replace dosen't support raid56 yet, please fix
-		 * it in the future.
-		 */
+
 		btrfs_bio_counter_dec(root->fs_info);
 		return ret;
 	}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 08980fa23039..d6fe73c0f4a2 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -292,7 +292,7 @@ struct btrfs_bio_stripe {
 struct btrfs_bio;
 typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
 
-#define BTRFS_BIO_ORIG_BIO_SUBMITTED	0x1
+#define BTRFS_BIO_ORIG_BIO_SUBMITTED	(1 << 0)
 
 struct btrfs_bio {
 	atomic_t stripes_pending;
@@ -305,6 +305,8 @@ struct btrfs_bio {
 	int max_errors;
 	int num_stripes;
 	int mirror_num;
+	int num_tgtdevs;
+	int *tgtdev_map;
 	struct btrfs_bio_stripe stripes[];
 };
 
@@ -387,12 +389,18 @@ struct btrfs_balance_control {
 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 				   u64 end, u64 *length);
 
-#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
-			    (sizeof(struct btrfs_bio_stripe) * (n)))
+#define btrfs_bio_size(total_stripes, real_stripes)		\
+	(sizeof(struct btrfs_bio) +				\
+	 (sizeof(struct btrfs_bio_stripe) * (total_stripes)) +	\
+	 (sizeof(int) * (real_stripes)))
 
 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		    u64 logical, u64 *length,
 		    struct btrfs_bio **bbio_ret, int mirror_num);
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
+		     u64 logical, u64 *length,
+		     struct btrfs_bio **bbio_ret, int mirror_num,
+		     u64 **raid_map_ret);
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 		     u64 chunk_start, u64 physical, u64 devid,
 		     u64 **logical, int *naddrs, int *stripe_len);
@@ -448,8 +456,10 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
 			struct btrfs_fs_info *fs_info);
-void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
-				 struct btrfs_device *srcdev);
+void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
+					struct btrfs_device *srcdev);
+void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
+				      struct btrfs_device *srcdev);
 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 				      struct btrfs_device *tgtdev);
 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
@@ -513,4 +523,16 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
 void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
 void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
 					   struct btrfs_transaction *transaction);
+
+static inline void lock_chunks(struct btrfs_root *root)
+{
+	mutex_lock(&root->fs_info->chunk_mutex);
+}
+
+static inline void unlock_chunks(struct btrfs_root *root)
+{
+	mutex_unlock(&root->fs_info->chunk_mutex);
+}
+
+
 #endif
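
The reworked btrfs_bio_size() packs the per-stripe target-device map into the same allocation as the bbio header and its flexible stripe array; __btrfs_map_block() then simply points tgtdev_map just past the stripes. A standalone sketch of that single-allocation layout, with stand-in struct names rather than the kernel types:

	#include <stdlib.h>

	struct stripe { unsigned long long physical; void *dev; };

	struct bio_ctl {
		int num_stripes;
		int num_tgtdevs;
		int *tgtdev_map;		/* points into this allocation */
		struct stripe stripes[];	/* flexible array member */
	};

	/*
	 * One zeroed allocation holds the header, the stripe array and,
	 * when a replace is running, an int map right behind the stripes:
	 *
	 *   [bio_ctl][stripe 0..n-1][tgtdev_map 0..m-1]
	 */
	static struct bio_ctl *alloc_bio_ctl(int total_stripes, int real_stripes)
	{
		size_t sz = sizeof(struct bio_ctl) +
			    sizeof(struct stripe) * total_stripes +
			    sizeof(int) * real_stripes;
		struct bio_ctl *bc = calloc(1, sz);

		if (!bc)
			return NULL;
		bc->num_stripes = total_stripes;
		bc->num_tgtdevs = real_stripes;
		if (real_stripes)
			bc->tgtdev_map = (int *)(bc->stripes + total_stripes);
		return bc;
	}

	int main(void)
	{
		struct bio_ctl *bc = alloc_bio_ctl(4, 2);

		if (!bc)
			return 1;
		bc->tgtdev_map[0] = 3;	/* lives inside the same block */
		free(bc);		/* one free releases everything */
		return 0;
	}

Tucking the int map behind the stripes is safe here because the stripe array's alignment is at least that of int, and a single free tears the whole thing down, which is exactly what makes this pattern attractive in error paths.
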
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index dcf20131fbe4..47b19465f0dc 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -29,6 +29,7 @@
 #include "xattr.h"
 #include "disk-io.h"
 #include "props.h"
+#include "locking.h"
 
 
 ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
@@ -91,7 +92,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
 		       struct inode *inode, const char *name,
 		       const void *value, size_t size, int flags)
 {
-	struct btrfs_dir_item *di;
+	struct btrfs_dir_item *di = NULL;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path;
 	size_t name_len = strlen(name);
@@ -103,84 +104,119 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+	path->skip_release_on_error = 1;
+
+	if (!value) {
+		di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
+					name, name_len, -1);
+		if (!di && (flags & XATTR_REPLACE))
+			ret = -ENODATA;
+		else if (di)
+			ret = btrfs_delete_one_dir_name(trans, root, path, di);
+		goto out;
+	}
 
+	/*
+	 * For a replace we can't just do the insert blindly.
+	 * Do a lookup first (read-only btrfs_search_slot), and return if xattr
+	 * doesn't exist. If it exists, fall down below to the insert/replace
+	 * path - we can't race with a concurrent xattr delete, because the VFS
+	 * locks the inode's i_mutex before calling setxattr or removexattr.
+	 */
 	if (flags & XATTR_REPLACE) {
-		di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
-					name_len, -1);
-		if (IS_ERR(di)) {
-			ret = PTR_ERR(di);
-			goto out;
-		} else if (!di) {
+		ASSERT(mutex_is_locked(&inode->i_mutex));
+		di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
+					name, name_len, 0);
+		if (!di) {
 			ret = -ENODATA;
 			goto out;
 		}
-		ret = btrfs_delete_one_dir_name(trans, root, path, di);
-		if (ret)
-			goto out;
 		btrfs_release_path(path);
+		di = NULL;
+	}
 
+	ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
+				      name, name_len, value, size);
+	if (ret == -EOVERFLOW) {
 		/*
-		 * remove the attribute
+		 * We have an existing item in a leaf, split_leaf couldn't
+		 * expand it. That item might have or not a dir_item that
+		 * matches our target xattr, so lets check.
 		 */
-		if (!value)
-			goto out;
-	} else {
-		di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
-					name, name_len, 0);
-		if (IS_ERR(di)) {
-			ret = PTR_ERR(di);
+		ret = 0;
+		btrfs_assert_tree_locked(path->nodes[0]);
+		di = btrfs_match_dir_item_name(root, path, name, name_len);
+		if (!di && !(flags & XATTR_REPLACE)) {
+			ret = -ENOSPC;
 			goto out;
 		}
-		if (!di && !value)
-			goto out;
-		btrfs_release_path(path);
+	} else if (ret == -EEXIST) {
+		ret = 0;
+		di = btrfs_match_dir_item_name(root, path, name, name_len);
+		ASSERT(di); /* logic error */
+	} else if (ret) {
+		goto out;
 	}
 
-again:
-	ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
-				      name, name_len, value, size);
-	/*
-	 * If we're setting an xattr to a new value but the new value is say
-	 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
-	 * back from split_leaf. This is because it thinks we'll be extending
-	 * the existing item size, but we're asking for enough space to add the
-	 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
-	 * the rest of the function figure it out.
-	 */
-	if (ret == -EOVERFLOW)
+	if (di && (flags & XATTR_CREATE)) {
 		ret = -EEXIST;
+		goto out;
+	}
 
-	if (ret == -EEXIST) {
-		if (flags & XATTR_CREATE)
-			goto out;
+	if (di) {
 		/*
-		 * We can't use the path we already have since we won't have the
-		 * proper locking for a delete, so release the path and
-		 * re-lookup to delete the thing.
+		 * We're doing a replace, and it must be atomic, that is, at
+		 * any point in time we have either the old or the new xattr
+		 * value in the tree. We don't want readers (getxattr and
+		 * listxattrs) to miss a value, this is specially important
+		 * for ACLs.
 		 */
-		btrfs_release_path(path);
-		di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
-					name, name_len, -1);
-		if (IS_ERR(di)) {
-			ret = PTR_ERR(di);
-			goto out;
-		} else if (!di) {
-			/* Shouldn't happen but just in case... */
-			btrfs_release_path(path);
-			goto again;
+		const int slot = path->slots[0];
+		struct extent_buffer *leaf = path->nodes[0];
+		const u16 old_data_len = btrfs_dir_data_len(leaf, di);
+		const u32 item_size = btrfs_item_size_nr(leaf, slot);
+		const u32 data_size = sizeof(*di) + name_len + size;
+		struct btrfs_item *item;
+		unsigned long data_ptr;
+		char *ptr;
+
+		if (size > old_data_len) {
+			if (btrfs_leaf_free_space(root, leaf) <
+			    (size - old_data_len)) {
+				ret = -ENOSPC;
+				goto out;
+			}
 		}
 
-		ret = btrfs_delete_one_dir_name(trans, root, path, di);
-		if (ret)
-			goto out;
+		if (old_data_len + name_len + sizeof(*di) == item_size) {
+			/* No other xattrs packed in the same leaf item. */
+			if (size > old_data_len)
+				btrfs_extend_item(root, path,
+						  size - old_data_len);
+			else if (size < old_data_len)
+				btrfs_truncate_item(root, path, data_size, 1);
+		} else {
+			/* There are other xattrs packed in the same item. */
+			ret = btrfs_delete_one_dir_name(trans, root, path, di);
+			if (ret)
+				goto out;
+			btrfs_extend_item(root, path, data_size);
+		}
 
+		item = btrfs_item_nr(slot);
+		ptr = btrfs_item_ptr(leaf, slot, char);
+		ptr += btrfs_item_size(leaf, item) - data_size;
+		di = (struct btrfs_dir_item *)ptr;
+		btrfs_set_dir_data_len(leaf, di, size);
+		data_ptr = ((unsigned long)(di + 1)) + name_len;
+		write_extent_buffer(leaf, value, data_ptr, size);
+		btrfs_mark_buffer_dirty(leaf);
+	} else {
 		/*
-		 * We have a value to set, so go back and try to insert it now.
+		 * Insert, and we had space for the xattr, so path->slots[0] is
+		 * where our xattr dir_item is and btrfs_insert_xattr_item()
+		 * filled it.
 		 */
-		if (value) {
-			btrfs_release_path(path);
-			goto again;
-		}
 	}
 out:
 	btrfs_free_path(path);
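
The rewritten do_setxattr() makes a replace atomic: readers always see either the old or the new value, never a window with the xattr missing, while XATTR_CREATE and XATTR_REPLACE keep their usual semantics. Those semantics are easiest to see from user space; a small sketch against an assumed mount point and attribute name:

	#include <stdio.h>
	#include <string.h>
	#include <sys/xattr.h>

	/*
	 * Exercise the three do_setxattr() paths: plain set (create or
	 * replace), XATTR_CREATE (EEXIST if the attribute is present) and
	 * XATTR_REPLACE (ENODATA if it is absent). The path is illustrative.
	 */
	int main(void)
	{
		const char *path = "/mnt/btrfs/file";
		const char *val = "bar";

		if (setxattr(path, "user.foo", val, strlen(val), 0))
			perror("set");
		if (setxattr(path, "user.foo", val, strlen(val), XATTR_CREATE))
			perror("create (EEXIST expected)");
		if (setxattr(path, "user.foo", "baz", 3, XATTR_REPLACE))
			perror("replace");
		return 0;
	}

A concurrent getxattr() on "user.foo" during the replace call should observe either "bar" or "baz", which is the guarantee the in-place leaf rewrite above provides and which matters particularly for ACLs.
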