Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/check-integrity.c    163
-rw-r--r--  fs/btrfs/compression.c         18
-rw-r--r--  fs/btrfs/ctree.c                2
-rw-r--r--  fs/btrfs/ctree.h               85
-rw-r--r--  fs/btrfs/dev-replace.c         32
-rw-r--r--  fs/btrfs/dir-item.c            10
-rw-r--r--  fs/btrfs/disk-io.c             49
-rw-r--r--  fs/btrfs/extent-tree.c        211
-rw-r--r--  fs/btrfs/extent_io.c           41
-rw-r--r--  fs/btrfs/extent_io.h            1
-rw-r--r--  fs/btrfs/extent_map.c           2
-rw-r--r--  fs/btrfs/file.c                51
-rw-r--r--  fs/btrfs/free-space-cache.c   117
-rw-r--r--  fs/btrfs/free-space-cache.h     2
-rw-r--r--  fs/btrfs/inode-map.c            4
-rw-r--r--  fs/btrfs/inode.c              152
-rw-r--r--  fs/btrfs/ioctl.c               36
-rw-r--r--  fs/btrfs/ordered-data.c        49
-rw-r--r--  fs/btrfs/ordered-data.h        12
-rw-r--r--  fs/btrfs/raid56.c             763
-rw-r--r--  fs/btrfs/raid56.h              16
-rw-r--r--  fs/btrfs/scrub.c              893
-rw-r--r--  fs/btrfs/send.c                49
-rw-r--r--  fs/btrfs/super.c               94
-rw-r--r--  fs/btrfs/sysfs.c               34
-rw-r--r--  fs/btrfs/transaction.c        166
-rw-r--r--  fs/btrfs/transaction.h          6
-rw-r--r--  fs/btrfs/tree-log.c            50
-rw-r--r--  fs/btrfs/volumes.c             90
-rw-r--r--  fs/btrfs/volumes.h             32
-rw-r--r--  fs/btrfs/xattr.c              150
31 files changed, 2739 insertions, 641 deletions
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index cb7f3fe9c9f6..d897ef803b3b 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -94,6 +94,7 @@
 #include <linux/mutex.h>
 #include <linux/genhd.h>
 #include <linux/blkdev.h>
+#include <linux/vmalloc.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "hash.h"
@@ -326,9 +327,6 @@ static int btrfsic_handle_extent_data(struct btrfsic_state *state,
 static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
			     struct btrfsic_block_data_ctx *block_ctx_out,
			     int mirror_num);
-static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
-				  u32 len, struct block_device *bdev,
-				  struct btrfsic_block_data_ctx *block_ctx_out);
 static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
 static int btrfsic_read_block(struct btrfsic_state *state,
			      struct btrfsic_block_data_ctx *block_ctx);
@@ -1326,24 +1324,25 @@ static int btrfsic_create_link_to_next_block(
 			l = NULL;
 		next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
 	} else {
-		if (next_block->logical_bytenr != next_bytenr &&
-		    !(!next_block->is_metadata &&
-		      0 == next_block->logical_bytenr)) {
-			printk(KERN_INFO
-			       "Referenced block @%llu (%s/%llu/%d)"
-			       " found in hash table, %c,"
-			       " bytenr mismatch (!= stored %llu).\n",
-			       next_bytenr, next_block_ctx->dev->name,
-			       next_block_ctx->dev_bytenr, *mirror_nump,
-			       btrfsic_get_block_type(state, next_block),
-			       next_block->logical_bytenr);
-		} else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-			printk(KERN_INFO
-			       "Referenced block @%llu (%s/%llu/%d)"
-			       " found in hash table, %c.\n",
-			       next_bytenr, next_block_ctx->dev->name,
-			       next_block_ctx->dev_bytenr, *mirror_nump,
-			       btrfsic_get_block_type(state, next_block));
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+			if (next_block->logical_bytenr != next_bytenr &&
+			    !(!next_block->is_metadata &&
+			      0 == next_block->logical_bytenr))
+				printk(KERN_INFO
+				       "Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+				       next_bytenr, next_block_ctx->dev->name,
+				       next_block_ctx->dev_bytenr, *mirror_nump,
+				       btrfsic_get_block_type(state,
+							      next_block),
+				       next_block->logical_bytenr);
+			else
+				printk(KERN_INFO
+				       "Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+				       next_bytenr, next_block_ctx->dev->name,
+				       next_block_ctx->dev_bytenr, *mirror_nump,
+				       btrfsic_get_block_type(state,
+							      next_block));
+		}
 		next_block->logical_bytenr = next_bytenr;
 
 		next_block->mirror_num = *mirror_nump;
@@ -1529,7 +1528,9 @@ static int btrfsic_handle_extent_data(
 			return -1;
 		}
 		if (!block_was_created) {
-			if (next_block->logical_bytenr != next_bytenr &&
+			if ((state->print_mask &
+			     BTRFSIC_PRINT_MASK_VERBOSE) &&
+			    next_block->logical_bytenr != next_bytenr &&
 			    !(!next_block->is_metadata &&
 			      0 == next_block->logical_bytenr)) {
 				printk(KERN_INFO
@@ -1607,25 +1608,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 	return ret;
 }
 
-static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
-				  u32 len, struct block_device *bdev,
-				  struct btrfsic_block_data_ctx *block_ctx_out)
-{
-	block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
-	block_ctx_out->dev_bytenr = bytenr;
-	block_ctx_out->start = bytenr;
-	block_ctx_out->len = len;
-	block_ctx_out->datav = NULL;
-	block_ctx_out->pagev = NULL;
-	block_ctx_out->mem_to_free = NULL;
-	if (NULL != block_ctx_out->dev) {
-		return 0;
-	} else {
-		printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
-		return -ENXIO;
-	}
-}
-
 static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
 {
 	if (block_ctx->mem_to_free) {
@@ -1901,25 +1883,26 @@ again:
							       dev_state,
							       dev_bytenr);
 			}
-			if (block->logical_bytenr != bytenr &&
-			    !(!block->is_metadata &&
-			      block->logical_bytenr == 0))
-				printk(KERN_INFO
-				       "Written block @%llu (%s/%llu/%d)"
-				       " found in hash table, %c,"
-				       " bytenr mismatch"
-				       " (!= stored %llu).\n",
-				       bytenr, dev_state->name, dev_bytenr,
-				       block->mirror_num,
-				       btrfsic_get_block_type(state, block),
-				       block->logical_bytenr);
-			else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-				printk(KERN_INFO
-				       "Written block @%llu (%s/%llu/%d)"
-				       " found in hash table, %c.\n",
-				       bytenr, dev_state->name, dev_bytenr,
-				       block->mirror_num,
-				       btrfsic_get_block_type(state, block));
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+				if (block->logical_bytenr != bytenr &&
+				    !(!block->is_metadata &&
+				      block->logical_bytenr == 0))
+					printk(KERN_INFO
+					       "Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+					       bytenr, dev_state->name,
+					       dev_bytenr,
+					       block->mirror_num,
+					       btrfsic_get_block_type(state,
+								      block),
+					       block->logical_bytenr);
+				else
+					printk(KERN_INFO
+					       "Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+					       bytenr, dev_state->name,
+					       dev_bytenr, block->mirror_num,
+					       btrfsic_get_block_type(state,
+								      block));
+			}
 			block->logical_bytenr = bytenr;
 		} else {
 			if (num_pages * PAGE_CACHE_SIZE <
@@ -2002,24 +1985,13 @@ again:
 			}
 		}
 
-		if (block->is_superblock)
-			ret = btrfsic_map_superblock(state, bytenr,
-						     processed_len,
-						     bdev, &block_ctx);
-		else
-			ret = btrfsic_map_block(state, bytenr, processed_len,
-						&block_ctx, 0);
-		if (ret) {
-			printk(KERN_INFO
-			       "btrfsic: btrfsic_map_block(root @%llu)"
-			       " failed!\n", bytenr);
-			goto continue_loop;
-		}
-		block_ctx.datav = mapped_datav;
-		/* the following is required in case of writes to mirrors,
-		 * use the same that was used for the lookup */
 		block_ctx.dev = dev_state;
 		block_ctx.dev_bytenr = dev_bytenr;
+		block_ctx.start = bytenr;
+		block_ctx.len = processed_len;
+		block_ctx.pagev = NULL;
+		block_ctx.mem_to_free = NULL;
+		block_ctx.datav = mapped_datav;
 
 		if (is_metadata || state->include_extent_data) {
 			block->never_written = 0;
@@ -2133,10 +2105,6 @@ again:
 			/* this is getting ugly for the
 			 * include_extent_data case... */
 			bytenr = 0;	/* unknown */
-			block_ctx.start = bytenr;
-			block_ctx.len = processed_len;
-			block_ctx.mem_to_free = NULL;
-			block_ctx.pagev = NULL;
 		} else {
 			processed_len = state->metablock_size;
 			bytenr = btrfs_stack_header_bytenr(
@@ -2149,22 +2117,15 @@ again:
 				       "Written block @%llu (%s/%llu/?)"
 				       " !found in hash table, M.\n",
 				       bytenr, dev_state->name, dev_bytenr);
-
-			ret = btrfsic_map_block(state, bytenr, processed_len,
-						&block_ctx, 0);
-			if (ret) {
-				printk(KERN_INFO
-				       "btrfsic: btrfsic_map_block(root @%llu)"
-				       " failed!\n",
-				       dev_bytenr);
-				goto continue_loop;
-			}
 		}
-		block_ctx.datav = mapped_datav;
-		/* the following is required in case of writes to mirrors,
-		 * use the same that was used for the lookup */
+
 		block_ctx.dev = dev_state;
 		block_ctx.dev_bytenr = dev_bytenr;
+		block_ctx.start = bytenr;
+		block_ctx.len = processed_len;
+		block_ctx.pagev = NULL;
+		block_ctx.mem_to_free = NULL;
+		block_ctx.datav = mapped_datav;
 
 		block = btrfsic_block_alloc();
 		if (NULL == block) {
@@ -3130,10 +3091,13 @@ int btrfsic_mount(struct btrfs_root *root,
		       root->sectorsize, PAGE_CACHE_SIZE);
 		return -1;
 	}
-	state = kzalloc(sizeof(*state), GFP_NOFS);
-	if (NULL == state) {
-		printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
-		return -1;
+	state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+	if (!state) {
+		state = vzalloc(sizeof(*state));
+		if (!state) {
+			printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n");
+			return -1;
+		}
 	}
 
 	if (!btrfsic_is_initialized) {
@@ -3277,5 +3241,8 @@ void btrfsic_unmount(struct btrfs_root *root,
 
 	mutex_unlock(&btrfsic_mutex);
 
-	kfree(state);
+	if (is_vmalloc_addr(state))
+		vfree(state);
+	else
+		kfree(state);
 }
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index dcd9be32ac57..e9df8862012c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -224,16 +224,19 @@ out:
  * Clear the writeback bits on all of the file
  * pages for a compressed write
  */
-static noinline void end_compressed_writeback(struct inode *inode, u64 start,
-					      unsigned long ram_size)
+static noinline void end_compressed_writeback(struct inode *inode,
+					      const struct compressed_bio *cb)
 {
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+	unsigned long index = cb->start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT;
 	struct page *pages[16];
 	unsigned long nr_pages = end_index - index + 1;
 	int i;
 	int ret;
 
+	if (cb->errors)
+		mapping_set_error(inode->i_mapping, -EIO);
+
 	while (nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
@@ -244,6 +247,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start,
 			continue;
 		}
 		for (i = 0; i < ret; i++) {
+			if (cb->errors)
+				SetPageError(pages[i]);
 			end_page_writeback(pages[i]);
 			page_cache_release(pages[i]);
 		}
@@ -287,10 +292,11 @@ static void end_compressed_bio_write(struct bio *bio, int err)
 		tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
						 cb->start,
						 cb->start + cb->len - 1,
-						 NULL, 1);
+						 NULL,
+						 err ? 0 : 1);
 	cb->compressed_pages[0]->mapping = NULL;
 
-	end_compressed_writeback(inode, cb->start, cb->len);
+	end_compressed_writeback(inode, cb);
 	/* note, our inode could be gone now */
 
 	/*
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 150822ee0a0b..14a72ed14ef7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2929,7 +2929,7 @@ done:
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
-	if (ret < 0)
+	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fe69edda11fb..e6fbbd74b716 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -607,6 +607,7 @@ struct btrfs_path {
	unsigned int leave_spinning:1;
	unsigned int search_commit_root:1;
	unsigned int need_commit_sem:1;
+	unsigned int skip_release_on_error:1;
 };
 
 /*
@@ -1170,6 +1171,7 @@ struct btrfs_space_info {
	struct percpu_counter total_bytes_pinned;
 
	struct list_head list;
+	struct list_head ro_bgs;
 
	struct rw_semaphore groups_sem;
	/* for block groups in our same type */
@@ -1276,6 +1278,8 @@ struct btrfs_block_group_cache {
	unsigned int ro:1;
	unsigned int dirty:1;
	unsigned int iref:1;
+	unsigned int has_caching_ctl:1;
+	unsigned int removed:1;
 
	int disk_cache_state;
 
@@ -1305,6 +1309,11 @@ struct btrfs_block_group_cache {
 
	/* For delayed block group creation or deletion of empty block groups */
	struct list_head bg_list;
+
+	/* For read-only block groups */
+	struct list_head ro_list;
+
+	atomic_t trimming;
 };
 
 /* delayed seq elem */
@@ -1402,6 +1411,11 @@ struct btrfs_fs_info {
	 */
	u64 last_trans_log_full_commit;
	unsigned long mount_opt;
+	/*
+	 * Track requests for actions that need to be done during transaction
+	 * commit (like for some mount options).
+	 */
+	unsigned long pending_changes;
	unsigned long compress_type:4;
	int commit_interval;
	/*
@@ -1729,6 +1743,12 @@ struct btrfs_fs_info {
 
	/* For btrfs to record security options */
	struct security_mnt_opts security_opts;
+
+	/*
+	 * Chunks that can't be freed yet (under a trim/discard operation)
+	 * and will be latter freed. Protected by fs_info->chunk_mutex.
+	 */
+	struct list_head pinned_chunks;
 };
 
 struct btrfs_subvolume_writers {
@@ -2093,7 +2113,6 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
 #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR	(1 << 22)
 #define BTRFS_MOUNT_RESCAN_UUID_TREE		(1 << 23)
-#define BTRFS_MOUNT_CHANGE_INODE_CACHE		(1 << 24)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 #define BTRFS_DEFAULT_MAX_INLINE	(8192)
@@ -2103,6 +2122,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define btrfs_raw_test_opt(o, opt)	((o) & BTRFS_MOUNT_##opt)
 #define btrfs_test_opt(root, opt)	((root)->fs_info->mount_opt & \
					 BTRFS_MOUNT_##opt)
+
 #define btrfs_set_and_info(root, opt, fmt, args...)			\
 {									\
	if (!btrfs_test_opt(root, opt))					\
@@ -2118,6 +2138,49 @@ struct btrfs_ioctl_defrag_range_args {
 }
 
 /*
+ * Requests for changes that need to be done during transaction commit.
+ *
+ * Internal mount options that are used for special handling of the real
+ * mount options (eg. cannot be set during remount and have to be set during
+ * transaction commit)
+ */
+
+#define BTRFS_PENDING_SET_INODE_MAP_CACHE	(0)
+#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE	(1)
+#define BTRFS_PENDING_COMMIT			(2)
+
+#define btrfs_test_pending(info, opt)	\
+	test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_set_pending(info, opt)	\
+	set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_clear_pending(info, opt)	\
+	clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+
+/*
+ * Helpers for setting pending mount option changes.
+ *
+ * Expects corresponding macros
+ * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name
+ */
+#define btrfs_set_pending_and_info(info, opt, fmt, args...)		\
+do {									\
+	if (!btrfs_raw_test_opt((info)->mount_opt, opt)) {		\
+		btrfs_info((info), fmt, ##args);			\
+		btrfs_set_pending((info), SET_##opt);			\
+		btrfs_clear_pending((info), CLEAR_##opt);		\
+	}								\
+} while(0)
+
+#define btrfs_clear_pending_and_info(info, opt, fmt, args...)		\
+do {									\
+	if (btrfs_raw_test_opt((info)->mount_opt, opt)) {		\
+		btrfs_info((info), fmt, ##args);			\
+		btrfs_set_pending((info), CLEAR_##opt);			\
+		btrfs_clear_pending((info), SET_##opt);			\
+	}								\
+} while(0)
+
+/*
  * Inode flags
  */
 #define BTRFS_INODE_NODATASUM		(1 << 0)
@@ -3351,7 +3414,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
			   u64 type, u64 chunk_objectid, u64 chunk_offset,
			   u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 group_start);
+			     struct btrfs_root *root, u64 group_start,
+			     struct extent_map *em);
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root);
@@ -3427,8 +3491,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info);
 int __get_raid_index(u64 flags);
-int btrfs_start_nocow_write(struct btrfs_root *root);
-void btrfs_end_nocow_write(struct btrfs_root *root);
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot);
@@ -3686,6 +3750,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 int verify_dir_item(struct btrfs_root *root,
		    struct extent_buffer *leaf,
		    struct btrfs_dir_item *dir_item);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+						 struct btrfs_path *path,
+						 const char *name,
+						 int name_len);
 
 /* orphan.c */
 int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -3857,6 +3925,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
				    struct btrfs_trans_handle *trans, int mode,
				    u64 start, u64 num_bytes, u64 min_size,
				    loff_t actual_len, u64 *alloc_hint);
+int btrfs_inode_check_errors(struct inode *inode);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
@@ -3901,6 +3970,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
		      struct page **pages, size_t num_pages,
		      loff_t pos, size_t write_bytes,
		      struct extent_state **cached);
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -4097,7 +4167,12 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 /* dev-replace.c */
 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
 void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
-void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
+
+static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+	btrfs_bio_counter_sub(fs_info, 1);
+}
 
 /* reada.c */
 struct reada_control {
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 6f662b34ba0e..ca6a3a3b6b6c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -316,11 +316,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
	struct btrfs_device *tgt_device = NULL;
	struct btrfs_device *src_device = NULL;
 
-	if (btrfs_fs_incompat(fs_info, RAID56)) {
-		btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
-		return -EOPNOTSUPP;
-	}
-
	switch (args->start.cont_reading_from_srcdev_mode) {
	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
@@ -422,9 +417,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
			      &dev_replace->scrub_progress, 0, 1);
 
	ret = btrfs_dev_replace_finishing(root->fs_info, ret);
-	WARN_ON(ret);
+	/* don't warn if EINPROGRESS, someone else might be running scrub */
+	if (ret == -EINPROGRESS) {
+		args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
+		ret = 0;
+	} else {
+		WARN_ON(ret);
+	}
 
-	return 0;
+	return ret;
 
 leave:
	dev_replace->srcdev = NULL;
@@ -542,7 +543,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 
-		return 0;
+		return scrub_ret;
	}
 
	printk_in_rcu(KERN_INFO
@@ -571,15 +572,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
	fs_info->fs_devices->rw_devices++;
 
-	/* replace the sysfs entry */
-	btrfs_kobj_rm_device(fs_info, src_device);
-	btrfs_kobj_add_device(fs_info, tgt_device);
-
	btrfs_dev_replace_unlock(dev_replace);
 
	btrfs_rm_dev_replace_blocked(fs_info);
 
-	btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+	btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
 
	btrfs_rm_dev_replace_unblocked(fs_info);
 
@@ -594,6 +591,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
	mutex_unlock(&uuid_mutex);
 
+	/* replace the sysfs entry */
+	btrfs_kobj_rm_device(fs_info, src_device);
+	btrfs_kobj_add_device(fs_info, tgt_device);
+	btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
+
	/* write back the superblocks */
	trans = btrfs_start_transaction(root, 0);
	if (!IS_ERR(trans))
@@ -920,9 +922,9 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
	percpu_counter_inc(&fs_info->bio_counter);
 }
 
-void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
 {
-	percpu_counter_dec(&fs_info->bio_counter);
+	percpu_counter_sub(&fs_info->bio_counter, amount);
 
	if (waitqueue_active(&fs_info->replace_wait))
		wake_up(&fs_info->replace_wait);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index fc8df866e919..1752625fb4dd 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,10 +21,6 @@
 #include "hash.h"
 #include "transaction.h"
 
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
-			      struct btrfs_path *path,
-			      const char *name, int name_len);
-
 /*
  * insert a name into a directory, doing overflow properly if there is a hash
  * collision. data_size indicates how big the item inserted should be. On
@@ -383,9 +379,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
  * this walks through all the entries in a dir item and finds one
  * for a specific name.
  */
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
-			      struct btrfs_path *path,
-			      const char *name, int name_len)
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+						 struct btrfs_path *path,
+						 const char *name, int name_len)
 {
	struct btrfs_dir_item *dir_item;
	unsigned long name_ptr;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1bf9f897065d..30965120772b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2384,6 +2384,8 @@ int open_ctree(struct super_block *sb,
	init_waitqueue_head(&fs_info->transaction_blocked_wait);
	init_waitqueue_head(&fs_info->async_submit_wait);
 
+	INIT_LIST_HEAD(&fs_info->pinned_chunks);
+
	ret = btrfs_alloc_stripe_hash_table(fs_info);
	if (ret) {
		err = ret;
@@ -2830,9 +2832,11 @@ retry_root_backup:
		btrfs_set_opt(fs_info->mount_opt, SSD);
	}
 
-	/* Set the real inode map cache flag */
-	if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE))
-		btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE);
+	/*
+	 * Mount does not set all options immediatelly, we can do it now and do
+	 * not have to wait for transaction commit
+	 */
+	btrfs_apply_pending_changes(fs_info);
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
@@ -3713,6 +3717,17 @@ void close_ctree(struct btrfs_root *root)
 
	btrfs_free_block_rsv(root, root->orphan_block_rsv);
	root->orphan_block_rsv = NULL;
+
+	lock_chunks(root);
+	while (!list_empty(&fs_info->pinned_chunks)) {
+		struct extent_map *em;
+
+		em = list_first_entry(&fs_info->pinned_chunks,
+				      struct extent_map, list);
+		list_del_init(&em->list);
+		free_extent_map(em);
+	}
+	unlock_chunks(root);
 }
 
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -3839,12 +3854,12 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
	 */
	if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
-				sb->root);
+				btrfs_super_root(sb));
	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
-		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
-				sb->chunk_root);
+		printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
+				btrfs_super_chunk_root(sb));
	if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
-		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+		printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
				btrfs_super_log_root(sb));
 
	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
@@ -4129,6 +4144,25 @@ again:
	return 0;
 }
 
+static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
+				       struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	spin_lock(&fs_info->trans_lock);
+	while (!list_empty(&cur_trans->pending_ordered)) {
+		ordered = list_first_entry(&cur_trans->pending_ordered,
+					   struct btrfs_ordered_extent,
+					   trans_list);
+		list_del_init(&ordered->trans_list);
+		spin_unlock(&fs_info->trans_lock);
+
+		btrfs_put_ordered_extent(ordered);
+		spin_lock(&fs_info->trans_lock);
+	}
+	spin_unlock(&fs_info->trans_lock);
+}
+
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
				   struct btrfs_root *root)
 {
@@ -4140,6 +4174,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
	cur_trans->state = TRANS_STATE_UNBLOCKED;
	wake_up(&root->fs_info->transaction_wait);
 
+	btrfs_free_pending_ordered(cur_trans, root->fs_info);
	btrfs_destroy_delayed_inodes(root);
	btrfs_assert_delayed_root_empty(root);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 47c1ba141082..222d6aea4a8a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -315,12 +315,6 @@ get_caching_control(struct btrfs_block_group_cache *cache)
	struct btrfs_caching_control *ctl;
 
	spin_lock(&cache->lock);
-	if (cache->cached != BTRFS_CACHE_STARTED) {
-		spin_unlock(&cache->lock);
-		return NULL;
-	}
-
-	/* We're loading it the fast way, so we don't have a caching_ctl. */
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
@@ -594,6 +588,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
	spin_unlock(&cache->lock);
 
	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
+		mutex_lock(&caching_ctl->mutex);
		ret = load_free_space_cache(fs_info, cache);
 
		spin_lock(&cache->lock);
@@ -601,15 +596,19 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_FINISHED;
			cache->last_byte_to_unpin = (u64)-1;
+			caching_ctl->progress = (u64)-1;
		} else {
			if (load_cache_only) {
				cache->caching_ctl = NULL;
				cache->cached = BTRFS_CACHE_NO;
			} else {
				cache->cached = BTRFS_CACHE_STARTED;
+				cache->has_caching_ctl = 1;
			}
		}
		spin_unlock(&cache->lock);
+		mutex_unlock(&caching_ctl->mutex);
+
		wake_up(&caching_ctl->wait);
		if (ret == 1) {
			put_caching_control(caching_ctl);
@@ -627,6 +626,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
		cache->cached = BTRFS_CACHE_NO;
	} else {
		cache->cached = BTRFS_CACHE_STARTED;
+		cache->has_caching_ctl = 1;
	}
	spin_unlock(&cache->lock);
	wake_up(&caching_ctl->wait);
@@ -3162,7 +3162,19 @@ next_block_group(struct btrfs_root *root,
		 struct btrfs_block_group_cache *cache)
 {
	struct rb_node *node;
+
	spin_lock(&root->fs_info->block_group_cache_lock);
+
+	/* If our block group was removed, we need a full search. */
+	if (RB_EMPTY_NODE(&cache->cache_node)) {
+		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
+
+		spin_unlock(&root->fs_info->block_group_cache_lock);
+		btrfs_put_block_group(cache);
+		cache = btrfs_lookup_first_block_group(root->fs_info,
+						       next_bytenr);
+		return cache;
+	}
	node = rb_next(&cache->cache_node);
	btrfs_put_block_group(cache);
	if (node) {
@@ -3504,6 +3516,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
	found->chunk_alloc = 0;
	found->flush = 0;
	init_waitqueue_head(&found->wait);
+	INIT_LIST_HEAD(&found->ro_bgs);
 
	ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
				    info->space_info_kobj, "%s",
@@ -5425,7 +5438,17 @@ static int update_block_group(struct btrfs_root *root,
			spin_unlock(&cache->space_info->lock);
		} else {
			old_val -= num_bytes;
+			btrfs_set_block_group_used(&cache->item, old_val);
+			cache->pinned += num_bytes;
+			cache->space_info->bytes_pinned += num_bytes;
+			cache->space_info->bytes_used -= num_bytes;
+			cache->space_info->disk_used -= num_bytes * factor;
+			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
 
+			set_extent_dirty(info->pinned_extents,
+					 bytenr, bytenr + num_bytes - 1,
+					 GFP_NOFS | __GFP_NOFAIL);
			/*
			 * No longer have used bytes in this block group, queue
			 * it for deletion.
@@ -5439,17 +5462,6 @@ static int update_block_group(struct btrfs_root *root,
			}
			spin_unlock(&info->unused_bgs_lock);
		}
-		btrfs_set_block_group_used(&cache->item, old_val);
-		cache->pinned += num_bytes;
-		cache->space_info->bytes_pinned += num_bytes;
-		cache->space_info->bytes_used -= num_bytes;
-		cache->space_info->disk_used -= num_bytes * factor;
-		spin_unlock(&cache->lock);
-		spin_unlock(&cache->space_info->lock);
-
-		set_extent_dirty(info->pinned_extents,
-				 bytenr, bytenr + num_bytes - 1,
-				 GFP_NOFS | __GFP_NOFAIL);
	}
	btrfs_put_block_group(cache);
	total -= num_bytes;
@@ -8511,6 +8523,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
	    min_allocable_bytes <= sinfo->total_bytes) {
		sinfo->bytes_readonly += num_bytes;
		cache->ro = 1;
+		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
		ret = 0;
	}
 out:
@@ -8565,15 +8578,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
 
 /*
  * helper to account the unused space of all the readonly block group in the
- * list. takes mirrors into account.
+ * space_info. takes mirrors into account.
  */
-static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
 {
	struct btrfs_block_group_cache *block_group;
	u64 free_bytes = 0;
	int factor;
 
-	list_for_each_entry(block_group, groups_list, list) {
+	/* It's df, we don't care if it's racey */
+	if (list_empty(&sinfo->ro_bgs))
+		return 0;
+
+	spin_lock(&sinfo->lock);
+	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
		spin_lock(&block_group->lock);
 
		if (!block_group->ro) {
@@ -8594,26 +8612,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
 
		spin_unlock(&block_group->lock);
	}
-
-	return free_bytes;
-}
-
-/*
- * helper to account the unused space of all the readonly block group in the
- * space_info. takes mirrors into account.
- */
-u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
-{
-	int i;
-	u64 free_bytes = 0;
-
-	spin_lock(&sinfo->lock);
-
-	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
-		if (!list_empty(&sinfo->block_groups[i]))
-			free_bytes += __btrfs_get_ro_block_group_free_space(
-						&sinfo->block_groups[i]);
-
	spin_unlock(&sinfo->lock);
 
	return free_bytes;
@@ -8633,6 +8631,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
		    cache->bytes_super - btrfs_block_group_used(&cache->item);
	sinfo->bytes_readonly -= num_bytes;
	cache->ro = 0;
+	list_del_init(&cache->ro_list);
	spin_unlock(&cache->lock);
	spin_unlock(&sinfo->lock);
 }
@@ -9002,7 +9001,9 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
	INIT_LIST_HEAD(&cache->list);
	INIT_LIST_HEAD(&cache->cluster_list);
	INIT_LIST_HEAD(&cache->bg_list);
+	INIT_LIST_HEAD(&cache->ro_list);
	btrfs_init_free_space_ctl(cache);
+	atomic_set(&cache->trimming, 0);
 
	return cache;
 }
@@ -9195,9 +9196,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
	int ret = 0;
 
	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
-		list_del_init(&block_group->bg_list);
		if (ret)
-			continue;
+			goto next;
 
		spin_lock(&block_group->lock);
		memcpy(&item, &block_group->item, sizeof(item));
@@ -9212,6 +9212,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
					key.objectid, key.offset);
		if (ret)
			btrfs_abort_transaction(trans, extent_root, ret);
+next:
+		list_del_init(&block_group->bg_list);
	}
 }
 
@@ -9304,7 +9306,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 }
 
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 group_start)
+			     struct btrfs_root *root, u64 group_start,
+			     struct extent_map *em)
 {
	struct btrfs_path *path;
	struct btrfs_block_group_cache *block_group;
@@ -9316,6 +9319,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
	int ret;
	int index;
	int factor;
+	struct btrfs_caching_control *caching_ctl = NULL;
+	bool remove_em;
 
	root = root->fs_info->extent_root;
 
@@ -9400,6 +9405,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
	spin_lock(&root->fs_info->block_group_cache_lock);
	rb_erase(&block_group->cache_node,
		 &root->fs_info->block_group_cache_tree);
+	RB_CLEAR_NODE(&block_group->cache_node);
 
	if (root->fs_info->first_logical_byte == block_group->key.objectid)
		root->fs_info->first_logical_byte = (u64)-1;
@@ -9411,6 +9417,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
	 * are still on the list after taking the semaphore
	 */
	list_del_init(&block_group->list);
+	list_del_init(&block_group->ro_list);
	if (list_empty(&block_group->space_info->block_groups[index])) {
		kobj = block_group->space_info->block_group_kobjs[index];
		block_group->space_info->block_group_kobjs[index] = NULL;
@@ -9422,8 +9429,32 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
		kobject_put(kobj);
	}
 
+	if (block_group->has_caching_ctl)
+		caching_ctl = get_caching_control(block_group);
	if (block_group->cached == BTRFS_CACHE_STARTED)
		wait_block_group_cache_done(block_group);
+	if (block_group->has_caching_ctl) {
+		down_write(&root->fs_info->commit_root_sem);
+		if (!caching_ctl) {
+			struct btrfs_caching_control *ctl;
+
+			list_for_each_entry(ctl,
+				    &root->fs_info->caching_block_groups, list)
+				if (ctl->block_group == block_group) {
+					caching_ctl = ctl;
+					atomic_inc(&caching_ctl->count);
+					break;
+				}
+		}
+		if (caching_ctl)
+			list_del_init(&caching_ctl->list);
+		up_write(&root->fs_info->commit_root_sem);
+		if (caching_ctl) {
+			/* Once for the caching bgs list and once for us. */
+			put_caching_control(caching_ctl);
+			put_caching_control(caching_ctl);
+		}
+	}
 
	btrfs_remove_free_space_cache(block_group);
 
@@ -9435,6 +9466,71 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
	memcpy(&key, &block_group->key, sizeof(key));
 
+	lock_chunks(root);
+	if (!list_empty(&em->list)) {
+		/* We're in the transaction->pending_chunks list. */
+		free_extent_map(em);
+	}
+	spin_lock(&block_group->lock);
+	block_group->removed = 1;
+	/*
+	 * At this point trimming can't start on this block group, because we
+	 * removed the block group from the tree fs_info->block_group_cache_tree
+	 * so no one can't find it anymore and even if someone already got this
+	 * block group before we removed it from the rbtree, they have already
+	 * incremented block_group->trimming - if they didn't, they won't find
+	 * any free space entries because we already removed them all when we
+	 * called btrfs_remove_free_space_cache().
+	 *
+	 * And we must not remove the extent map from the fs_info->mapping_tree
+	 * to prevent the same logical address range and physical device space
+	 * ranges from being reused for a new block group. This is because our
+	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+	 * completely transactionless, so while it is trimming a range the
+	 * currently running transaction might finish and a new one start,
+	 * allowing for new block groups to be created that can reuse the same
+	 * physical device locations unless we take this special care.
+	 */
+	remove_em = (atomic_read(&block_group->trimming) == 0);
+	/*
+	 * Make sure a trimmer task always sees the em in the pinned_chunks list
+	 * if it sees block_group->removed == 1 (needs to lock block_group->lock
+	 * before checking block_group->removed).
+	 */
+	if (!remove_em) {
+		/*
+		 * Our em might be in trans->transaction->pending_chunks which
+		 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
+		 * and so is the fs_info->pinned_chunks list.
+		 *
+		 * So at this point we must be holding the chunk_mutex to avoid
+		 * any races with chunk allocation (more specifically at
+		 * volumes.c:contains_pending_extent()), to ensure it always
+		 * sees the em, either in the pending_chunks list or in the
+		 * pinned_chunks list.
+		 */
+		list_move_tail(&em->list, &root->fs_info->pinned_chunks);
+	}
+	spin_unlock(&block_group->lock);
+
+	if (remove_em) {
+		struct extent_map_tree *em_tree;
+
+		em_tree = &root->fs_info->mapping_tree.map_tree;
+		write_lock(&em_tree->lock);
+		/*
+		 * The em might be in the pending_chunks list, so make sure the
+		 * chunk mutex is locked, since remove_extent_mapping() will
+		 * delete us from that list.
+		 */
+		remove_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+		/* once for the tree */
+		free_extent_map(em);
+	}
+
+	unlock_chunks(root);
+
	btrfs_put_block_group(block_group);
	btrfs_put_block_group(block_group);
 
@@ -9523,10 +9619,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
		 */
		start = block_group->key.objectid;
		end = start + block_group->key.offset - 1;
-		clear_extent_bits(&fs_info->freed_extents[0], start, end,
-				  EXTENT_DIRTY, GFP_NOFS);
-		clear_extent_bits(&fs_info->freed_extents[1], start, end,
-				  EXTENT_DIRTY, GFP_NOFS);
+		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
					EXTENT_DIRTY, GFP_NOFS);
+		if (ret) {
+			btrfs_set_block_group_rw(root, block_group);
+			goto end_trans;
+		}
+		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
					EXTENT_DIRTY, GFP_NOFS);
+		if (ret) {
+			btrfs_set_block_group_rw(root, block_group);
+			goto end_trans;
+		}
 
		/* Reset pinned so btrfs_put_block_group doesn't complain */
		block_group->pinned = 0;
@@ -9537,6 +9641,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
		 */
		ret = btrfs_remove_chunk(trans, root,
					 block_group->key.objectid);
+end_trans:
		btrfs_end_transaction(trans, root);
 next:
		btrfs_put_block_group(block_group);
@@ -9657,12 +9762,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 }
 
 /*
- * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
- * they are used to prevent the some tasks writing data into the page cache
- * by nocow before the subvolume is snapshoted, but flush the data into
- * the disk after the snapshot creation.
+ * btrfs_{start,end}_write_no_snapshoting() are similar to
+ * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
+ * data into the page cache through nocow before the subvolume is snapshoted,
+ * but flush the data into disk after the snapshot creation, or to prevent
+ * operations while snapshoting is ongoing and that cause the snapshot to be
+ * inconsistent (writes followed by expanding truncates for example).
  */
-void btrfs_end_nocow_write(struct btrfs_root *root)
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
 {
	percpu_counter_dec(&root->subv_writers->counter);
	/*
@@ -9674,7 +9781,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
	wake_up(&root->subv_writers->wait);
 }
 
-int btrfs_start_nocow_write(struct btrfs_root *root)
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
 {
	if (atomic_read(&root->will_be_snapshoted))
		return 0;
@@ -9685,7 +9792,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
	 */
	smp_mb();
	if (atomic_read(&root->will_be_snapshoted)) {
-		btrfs_end_nocow_write(root);
+		btrfs_end_write_no_snapshoting(root);
		return 0;
	}
	return 1;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bf3f424e0013..4ebabd237153 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -595,9 +595,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
595 clear = 1; 595 clear = 1;
596again: 596again:
597 if (!prealloc && (mask & __GFP_WAIT)) { 597 if (!prealloc && (mask & __GFP_WAIT)) {
598 /*
599 * Don't care for allocation failure here because we might end
600 * up not needing the pre-allocated extent state at all, which
601 * is the case if we only have in the tree extent states that
602 * cover our input range and don't cover too any other range.
603 * If we end up needing a new extent state we allocate it later.
604 */
598 prealloc = alloc_extent_state(mask); 605 prealloc = alloc_extent_state(mask);
599 if (!prealloc)
600 return -ENOMEM;
601 } 606 }
602 607
603 spin_lock(&tree->lock); 608 spin_lock(&tree->lock);
@@ -796,17 +801,25 @@ static void set_state_bits(struct extent_io_tree *tree,
796 state->state |= bits_to_set; 801 state->state |= bits_to_set;
797} 802}
798 803
799static void cache_state(struct extent_state *state, 804static void cache_state_if_flags(struct extent_state *state,
800 struct extent_state **cached_ptr) 805 struct extent_state **cached_ptr,
806 const u64 flags)
801{ 807{
802 if (cached_ptr && !(*cached_ptr)) { 808 if (cached_ptr && !(*cached_ptr)) {
803 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { 809 if (!flags || (state->state & flags)) {
804 *cached_ptr = state; 810 *cached_ptr = state;
805 atomic_inc(&state->refs); 811 atomic_inc(&state->refs);
806 } 812 }
807 } 813 }
808} 814}
809 815
816static void cache_state(struct extent_state *state,
817 struct extent_state **cached_ptr)
818{
819 return cache_state_if_flags(state, cached_ptr,
820 EXTENT_IOBITS | EXTENT_BOUNDARY);
821}
822
810/* 823/*
811 * set some bits on a range in the tree. This may require allocations or 824 * set some bits on a range in the tree. This may require allocations or
812 * sleeping, so the gfp mask is used to indicate what is allowed. 825 * sleeping, so the gfp mask is used to indicate what is allowed.
@@ -1058,13 +1071,21 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1058 int err = 0; 1071 int err = 0;
1059 u64 last_start; 1072 u64 last_start;
1060 u64 last_end; 1073 u64 last_end;
1074 bool first_iteration = true;
1061 1075
1062 btrfs_debug_check_extent_io_range(tree, start, end); 1076 btrfs_debug_check_extent_io_range(tree, start, end);
1063 1077
1064again: 1078again:
1065 if (!prealloc && (mask & __GFP_WAIT)) { 1079 if (!prealloc && (mask & __GFP_WAIT)) {
1080 /*
1081 * Best effort, don't worry if extent state allocation fails
1082 * here for the first iteration. We might have a cached state
1083 * that matches exactly the target range, in which case no
1084 * extent state allocations are needed. We'll only know this
1085 * after locking the tree.
1086 */
1066 prealloc = alloc_extent_state(mask); 1087 prealloc = alloc_extent_state(mask);
1067 if (!prealloc) 1088 if (!prealloc && !first_iteration)
1068 return -ENOMEM; 1089 return -ENOMEM;
1069 } 1090 }
1070 1091
@@ -1234,6 +1255,7 @@ search_again:
1234 spin_unlock(&tree->lock); 1255 spin_unlock(&tree->lock);
1235 if (mask & __GFP_WAIT) 1256 if (mask & __GFP_WAIT)
1236 cond_resched(); 1257 cond_resched();
1258 first_iteration = false;
1237 goto again; 1259 goto again;
1238} 1260}
1239 1261
@@ -1482,7 +1504,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1482 state = find_first_extent_bit_state(tree, start, bits); 1504 state = find_first_extent_bit_state(tree, start, bits);
1483got_it: 1505got_it:
1484 if (state) { 1506 if (state) {
1485 cache_state(state, cached_state); 1507 cache_state_if_flags(state, cached_state, 0);
1486 *start_ret = state->start; 1508 *start_ret = state->start;
1487 *end_ret = state->end; 1509 *end_ret = state->end;
1488 ret = 0; 1510 ret = 0;
@@ -1746,6 +1768,9 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1746 if (page_ops == 0) 1768 if (page_ops == 0)
1747 return 0; 1769 return 0;
1748 1770
1771 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1772 mapping_set_error(inode->i_mapping, -EIO);
1773
1749 while (nr_pages > 0) { 1774 while (nr_pages > 0) {
1750 ret = find_get_pages_contig(inode->i_mapping, index, 1775 ret = find_get_pages_contig(inode->i_mapping, index,
1751 min_t(unsigned long, 1776 min_t(unsigned long,
@@ -1763,6 +1788,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1763 clear_page_dirty_for_io(pages[i]); 1788 clear_page_dirty_for_io(pages[i]);
1764 if (page_ops & PAGE_SET_WRITEBACK) 1789 if (page_ops & PAGE_SET_WRITEBACK)
1765 set_page_writeback(pages[i]); 1790 set_page_writeback(pages[i]);
1791 if (page_ops & PAGE_SET_ERROR)
1792 SetPageError(pages[i]);
1766 if (page_ops & PAGE_END_WRITEBACK) 1793 if (page_ops & PAGE_END_WRITEBACK)
1767 end_page_writeback(pages[i]); 1794 end_page_writeback(pages[i]);
1768 if (page_ops & PAGE_UNLOCK) 1795 if (page_ops & PAGE_UNLOCK)
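
Both the clear_extent_bit() and convert_extent_bit() hunks above switch to a best-effort preallocation outside the tree lock: the allocation may fail, and the failure only matters if the operation later turns out to need the extra node. A small userspace sketch of that idiom, with hypothetical names (not the kernel code):

#include <pthread.h>
#include <stdlib.h>

struct node { int placeholder; };

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* 'needs_split' stands in for "the range only partially covers an extent state" */
static int clear_range(int needs_split)
{
	struct node *prealloc = malloc(sizeof(*prealloc));	/* failure tolerated here */
	int ret = 0;

	pthread_mutex_lock(&tree_lock);
	if (needs_split) {
		if (!prealloc) {
			ret = -1;	/* only now would -ENOMEM be returned */
			goto out;
		}
		/* ... consume prealloc to split the existing state ... */
		prealloc = NULL;
	}
	/* ... clear bits on states fully covered by the range ... */
out:
	pthread_mutex_unlock(&tree_lock);
	free(prealloc);			/* free(NULL) is a no-op */
	return ret;
}
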
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6d4b938be986..ece9ce87edff 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -49,6 +49,7 @@
49#define PAGE_SET_WRITEBACK (1 << 2) 49#define PAGE_SET_WRITEBACK (1 << 2)
50#define PAGE_END_WRITEBACK (1 << 3) 50#define PAGE_END_WRITEBACK (1 << 3)
51#define PAGE_SET_PRIVATE2 (1 << 4) 51#define PAGE_SET_PRIVATE2 (1 << 4)
52#define PAGE_SET_ERROR (1 << 5)
52 53
53/* 54/*
54 * page->private values. Every page that is controlled by the extent 55 * page->private values. Every page that is controlled by the extent
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 225302b39afb..6a98bddd8f33 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -287,8 +287,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
287 if (!em) 287 if (!em)
288 goto out; 288 goto out;
289 289
290 if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
291 list_move(&em->list, &tree->modified_extents);
292 em->generation = gen; 290 em->generation = gen;
293 clear_bit(EXTENT_FLAG_PINNED, &em->flags); 291 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
294 em->mod_start = em->start; 292 em->mod_start = em->start;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a18ceabd99a8..e4090259569b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1428,7 +1428,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1428 u64 num_bytes; 1428 u64 num_bytes;
1429 int ret; 1429 int ret;
1430 1430
1431 ret = btrfs_start_nocow_write(root); 1431 ret = btrfs_start_write_no_snapshoting(root);
1432 if (!ret) 1432 if (!ret)
1433 return -ENOSPC; 1433 return -ENOSPC;
1434 1434
@@ -1451,7 +1451,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1451 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); 1451 ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
1452 if (ret <= 0) { 1452 if (ret <= 0) {
1453 ret = 0; 1453 ret = 0;
1454 btrfs_end_nocow_write(root); 1454 btrfs_end_write_no_snapshoting(root);
1455 } else { 1455 } else {
1456 *write_bytes = min_t(size_t, *write_bytes , 1456 *write_bytes = min_t(size_t, *write_bytes ,
1457 num_bytes - pos + lockstart); 1457 num_bytes - pos + lockstart);
@@ -1543,7 +1543,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1543 btrfs_free_reserved_data_space(inode, 1543 btrfs_free_reserved_data_space(inode,
1544 reserve_bytes); 1544 reserve_bytes);
1545 else 1545 else
1546 btrfs_end_nocow_write(root); 1546 btrfs_end_write_no_snapshoting(root);
1547 break; 1547 break;
1548 } 1548 }
1549 1549
@@ -1632,7 +1632,7 @@ again:
1632 1632
1633 release_bytes = 0; 1633 release_bytes = 0;
1634 if (only_release_metadata) 1634 if (only_release_metadata)
1635 btrfs_end_nocow_write(root); 1635 btrfs_end_write_no_snapshoting(root);
1636 1636
1637 if (only_release_metadata && copied > 0) { 1637 if (only_release_metadata && copied > 0) {
1638 u64 lockstart = round_down(pos, root->sectorsize); 1638 u64 lockstart = round_down(pos, root->sectorsize);
@@ -1661,7 +1661,7 @@ again:
1661 1661
1662 if (release_bytes) { 1662 if (release_bytes) {
1663 if (only_release_metadata) { 1663 if (only_release_metadata) {
1664 btrfs_end_nocow_write(root); 1664 btrfs_end_write_no_snapshoting(root);
1665 btrfs_delalloc_release_metadata(inode, release_bytes); 1665 btrfs_delalloc_release_metadata(inode, release_bytes);
1666 } else { 1666 } else {
1667 btrfs_delalloc_release_space(inode, release_bytes); 1667 btrfs_delalloc_release_space(inode, release_bytes);
@@ -1676,6 +1676,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1676 loff_t pos) 1676 loff_t pos)
1677{ 1677{
1678 struct file *file = iocb->ki_filp; 1678 struct file *file = iocb->ki_filp;
1679 struct inode *inode = file_inode(file);
1679 ssize_t written; 1680 ssize_t written;
1680 ssize_t written_buffered; 1681 ssize_t written_buffered;
1681 loff_t endbyte; 1682 loff_t endbyte;
@@ -1692,8 +1693,15 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1692 err = written_buffered; 1693 err = written_buffered;
1693 goto out; 1694 goto out;
1694 } 1695 }
1696 /*
1697 * Ensure all data is persisted. We want the next direct IO read to be
1698 * able to read what was just written.
1699 */
1695 endbyte = pos + written_buffered - 1; 1700 endbyte = pos + written_buffered - 1;
1696 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); 1701 err = btrfs_fdatawrite_range(inode, pos, endbyte);
1702 if (err)
1703 goto out;
1704 err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1697 if (err) 1705 if (err)
1698 goto out; 1706 goto out;
1699 written += written_buffered; 1707 written += written_buffered;
@@ -1854,10 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1854 int ret; 1862 int ret;
1855 1863
1856 atomic_inc(&BTRFS_I(inode)->sync_writers); 1864 atomic_inc(&BTRFS_I(inode)->sync_writers);
1857 ret = filemap_fdatawrite_range(inode->i_mapping, start, end); 1865 ret = btrfs_fdatawrite_range(inode, start, end);
1858 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1859 &BTRFS_I(inode)->runtime_flags))
1860 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1861 atomic_dec(&BTRFS_I(inode)->sync_writers); 1866 atomic_dec(&BTRFS_I(inode)->sync_writers);
1862 1867
1863 return ret; 1868 return ret;
@@ -2810,3 +2815,29 @@ int btrfs_auto_defrag_init(void)
2810 2815
2811 return 0; 2816 return 0;
2812} 2817}
2818
2819int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
2820{
2821 int ret;
2822
2823 /*
2824 * So with compression we will find and lock a dirty page and clear the
2825 * first one as dirty, setup an async extent, and immediately return
2826 * with the entire range locked but with nobody actually marked with
2827 * writeback. So we can't just filemap_write_and_wait_range() and
2828 * expect it to work since it will just kick off a thread to do the
2829 * actual work. So we need to call filemap_fdatawrite_range _again_
2830 * since it will wait on the page lock, which won't be unlocked until
2831 * after the pages have been marked as writeback and so we're good to go
2832 * from there. We have to do this otherwise we'll miss the ordered
2833 * extents and that results in badness. Please Josef, do not think you
2834 * know better and pull this out at some point in the future, it is
2835 * right and you are wrong.
2836 */
2837 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
2838 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
2839 &BTRFS_I(inode)->runtime_flags))
2840 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
2841
2842 return ret;
2843}
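
The new btrfs_fdatawrite_range() helper centralizes the "write twice when async compressed extents are pending" quirk, so callers only need a write followed by a wait. A hedged sketch of that caller pattern (flush_and_wait is a hypothetical name; it simply combines the two calls that appear in the hunks above and is not a function from the tree):

static int flush_and_wait(struct inode *inode, loff_t start, loff_t end)
{
	int ret;

	/* kicks writeback; repeats itself internally if async extents are pending */
	ret = btrfs_fdatawrite_range(inode, start, end);
	if (ret)
		return ret;
	return filemap_fdatawait_range(inode->i_mapping, start, end);
}
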
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 33848196550e..030847bf7cec 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -27,10 +27,17 @@
27#include "disk-io.h" 27#include "disk-io.h"
28#include "extent_io.h" 28#include "extent_io.h"
29#include "inode-map.h" 29#include "inode-map.h"
30#include "volumes.h"
30 31
31#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 32#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
32#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 33#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
33 34
35struct btrfs_trim_range {
36 u64 start;
37 u64 bytes;
38 struct list_head list;
39};
40
34static int link_free_space(struct btrfs_free_space_ctl *ctl, 41static int link_free_space(struct btrfs_free_space_ctl *ctl,
35 struct btrfs_free_space *info); 42 struct btrfs_free_space *info);
36static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 43static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
@@ -881,6 +888,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
881 int ret; 888 int ret;
882 struct btrfs_free_cluster *cluster = NULL; 889 struct btrfs_free_cluster *cluster = NULL;
883 struct rb_node *node = rb_first(&ctl->free_space_offset); 890 struct rb_node *node = rb_first(&ctl->free_space_offset);
891 struct btrfs_trim_range *trim_entry;
884 892
885 /* Get the cluster for this block_group if it exists */ 893 /* Get the cluster for this block_group if it exists */
886 if (block_group && !list_empty(&block_group->cluster_list)) { 894 if (block_group && !list_empty(&block_group->cluster_list)) {
@@ -916,6 +924,21 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
916 cluster = NULL; 924 cluster = NULL;
917 } 925 }
918 } 926 }
927
928 /*
929 * Make sure we don't miss any range that was removed from our rbtree
930 * because trimming is running. Otherwise after a umount+mount (or crash
931 * after committing the transaction) we would leak free space and get
932 * an inconsistent free space cache report from fsck.
933 */
934 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
935 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
936 trim_entry->bytes, NULL);
937 if (ret)
938 goto fail;
939 *entries += 1;
940 }
941
919 return 0; 942 return 0;
920fail: 943fail:
921 return -ENOSPC; 944 return -ENOSPC;
@@ -1135,12 +1158,15 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1135 1158
1136 io_ctl_set_generation(&io_ctl, trans->transid); 1159 io_ctl_set_generation(&io_ctl, trans->transid);
1137 1160
1161 mutex_lock(&ctl->cache_writeout_mutex);
1138 /* Write out the extent entries in the free space cache */ 1162 /* Write out the extent entries in the free space cache */
1139 ret = write_cache_extent_entries(&io_ctl, ctl, 1163 ret = write_cache_extent_entries(&io_ctl, ctl,
1140 block_group, &entries, &bitmaps, 1164 block_group, &entries, &bitmaps,
1141 &bitmap_list); 1165 &bitmap_list);
1142 if (ret) 1166 if (ret) {
1167 mutex_unlock(&ctl->cache_writeout_mutex);
1143 goto out_nospc; 1168 goto out_nospc;
1169 }
1144 1170
1145 /* 1171 /*
1146 * Some spaces that are freed in the current transaction are pinned, 1172 * Some spaces that are freed in the current transaction are pinned,
@@ -1148,11 +1174,18 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1148 * committed, we shouldn't lose them. 1174 * committed, we shouldn't lose them.
1149 */ 1175 */
1150 ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); 1176 ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
1151 if (ret) 1177 if (ret) {
1178 mutex_unlock(&ctl->cache_writeout_mutex);
1152 goto out_nospc; 1179 goto out_nospc;
1180 }
1153 1181
1154 /* At last, we write out all the bitmaps. */ 1182 /*
1183 * At last, we write out all the bitmaps and keep cache_writeout_mutex
1184 * locked while doing it because a concurrent trim can be manipulating
1185 * or freeing the bitmap.
1186 */
1155 ret = write_bitmap_entries(&io_ctl, &bitmap_list); 1187 ret = write_bitmap_entries(&io_ctl, &bitmap_list);
1188 mutex_unlock(&ctl->cache_writeout_mutex);
1156 if (ret) 1189 if (ret)
1157 goto out_nospc; 1190 goto out_nospc;
1158 1191
@@ -2295,6 +2328,8 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
2295 ctl->start = block_group->key.objectid; 2328 ctl->start = block_group->key.objectid;
2296 ctl->private = block_group; 2329 ctl->private = block_group;
2297 ctl->op = &free_space_op; 2330 ctl->op = &free_space_op;
2331 INIT_LIST_HEAD(&ctl->trimming_ranges);
2332 mutex_init(&ctl->cache_writeout_mutex);
2298 2333
2299 /* 2334 /*
2300 * we only want to have 32k of ram per block group for keeping 2335 * we only want to have 32k of ram per block group for keeping
@@ -2911,10 +2946,12 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2911 2946
2912static int do_trimming(struct btrfs_block_group_cache *block_group, 2947static int do_trimming(struct btrfs_block_group_cache *block_group,
2913 u64 *total_trimmed, u64 start, u64 bytes, 2948 u64 *total_trimmed, u64 start, u64 bytes,
2914 u64 reserved_start, u64 reserved_bytes) 2949 u64 reserved_start, u64 reserved_bytes,
2950 struct btrfs_trim_range *trim_entry)
2915{ 2951{
2916 struct btrfs_space_info *space_info = block_group->space_info; 2952 struct btrfs_space_info *space_info = block_group->space_info;
2917 struct btrfs_fs_info *fs_info = block_group->fs_info; 2953 struct btrfs_fs_info *fs_info = block_group->fs_info;
2954 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2918 int ret; 2955 int ret;
2919 int update = 0; 2956 int update = 0;
2920 u64 trimmed = 0; 2957 u64 trimmed = 0;
@@ -2934,7 +2971,10 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
2934 if (!ret) 2971 if (!ret)
2935 *total_trimmed += trimmed; 2972 *total_trimmed += trimmed;
2936 2973
2974 mutex_lock(&ctl->cache_writeout_mutex);
2937 btrfs_add_free_space(block_group, reserved_start, reserved_bytes); 2975 btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
2976 list_del(&trim_entry->list);
2977 mutex_unlock(&ctl->cache_writeout_mutex);
2938 2978
2939 if (update) { 2979 if (update) {
2940 spin_lock(&space_info->lock); 2980 spin_lock(&space_info->lock);
@@ -2962,16 +3002,21 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2962 u64 bytes; 3002 u64 bytes;
2963 3003
2964 while (start < end) { 3004 while (start < end) {
3005 struct btrfs_trim_range trim_entry;
3006
3007 mutex_lock(&ctl->cache_writeout_mutex);
2965 spin_lock(&ctl->tree_lock); 3008 spin_lock(&ctl->tree_lock);
2966 3009
2967 if (ctl->free_space < minlen) { 3010 if (ctl->free_space < minlen) {
2968 spin_unlock(&ctl->tree_lock); 3011 spin_unlock(&ctl->tree_lock);
3012 mutex_unlock(&ctl->cache_writeout_mutex);
2969 break; 3013 break;
2970 } 3014 }
2971 3015
2972 entry = tree_search_offset(ctl, start, 0, 1); 3016 entry = tree_search_offset(ctl, start, 0, 1);
2973 if (!entry) { 3017 if (!entry) {
2974 spin_unlock(&ctl->tree_lock); 3018 spin_unlock(&ctl->tree_lock);
3019 mutex_unlock(&ctl->cache_writeout_mutex);
2975 break; 3020 break;
2976 } 3021 }
2977 3022
@@ -2980,6 +3025,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2980 node = rb_next(&entry->offset_index); 3025 node = rb_next(&entry->offset_index);
2981 if (!node) { 3026 if (!node) {
2982 spin_unlock(&ctl->tree_lock); 3027 spin_unlock(&ctl->tree_lock);
3028 mutex_unlock(&ctl->cache_writeout_mutex);
2983 goto out; 3029 goto out;
2984 } 3030 }
2985 entry = rb_entry(node, struct btrfs_free_space, 3031 entry = rb_entry(node, struct btrfs_free_space,
@@ -2988,6 +3034,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2988 3034
2989 if (entry->offset >= end) { 3035 if (entry->offset >= end) {
2990 spin_unlock(&ctl->tree_lock); 3036 spin_unlock(&ctl->tree_lock);
3037 mutex_unlock(&ctl->cache_writeout_mutex);
2991 break; 3038 break;
2992 } 3039 }
2993 3040
@@ -2997,6 +3044,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2997 bytes = min(extent_start + extent_bytes, end) - start; 3044 bytes = min(extent_start + extent_bytes, end) - start;
2998 if (bytes < minlen) { 3045 if (bytes < minlen) {
2999 spin_unlock(&ctl->tree_lock); 3046 spin_unlock(&ctl->tree_lock);
3047 mutex_unlock(&ctl->cache_writeout_mutex);
3000 goto next; 3048 goto next;
3001 } 3049 }
3002 3050
@@ -3004,9 +3052,13 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
3004 kmem_cache_free(btrfs_free_space_cachep, entry); 3052 kmem_cache_free(btrfs_free_space_cachep, entry);
3005 3053
3006 spin_unlock(&ctl->tree_lock); 3054 spin_unlock(&ctl->tree_lock);
3055 trim_entry.start = extent_start;
3056 trim_entry.bytes = extent_bytes;
3057 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3058 mutex_unlock(&ctl->cache_writeout_mutex);
3007 3059
3008 ret = do_trimming(block_group, total_trimmed, start, bytes, 3060 ret = do_trimming(block_group, total_trimmed, start, bytes,
3009 extent_start, extent_bytes); 3061 extent_start, extent_bytes, &trim_entry);
3010 if (ret) 3062 if (ret)
3011 break; 3063 break;
3012next: 3064next:
@@ -3035,17 +3087,21 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3035 3087
3036 while (offset < end) { 3088 while (offset < end) {
3037 bool next_bitmap = false; 3089 bool next_bitmap = false;
3090 struct btrfs_trim_range trim_entry;
3038 3091
3092 mutex_lock(&ctl->cache_writeout_mutex);
3039 spin_lock(&ctl->tree_lock); 3093 spin_lock(&ctl->tree_lock);
3040 3094
3041 if (ctl->free_space < minlen) { 3095 if (ctl->free_space < minlen) {
3042 spin_unlock(&ctl->tree_lock); 3096 spin_unlock(&ctl->tree_lock);
3097 mutex_unlock(&ctl->cache_writeout_mutex);
3043 break; 3098 break;
3044 } 3099 }
3045 3100
3046 entry = tree_search_offset(ctl, offset, 1, 0); 3101 entry = tree_search_offset(ctl, offset, 1, 0);
3047 if (!entry) { 3102 if (!entry) {
3048 spin_unlock(&ctl->tree_lock); 3103 spin_unlock(&ctl->tree_lock);
3104 mutex_unlock(&ctl->cache_writeout_mutex);
3049 next_bitmap = true; 3105 next_bitmap = true;
3050 goto next; 3106 goto next;
3051 } 3107 }
@@ -3054,6 +3110,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3054 ret2 = search_bitmap(ctl, entry, &start, &bytes); 3110 ret2 = search_bitmap(ctl, entry, &start, &bytes);
3055 if (ret2 || start >= end) { 3111 if (ret2 || start >= end) {
3056 spin_unlock(&ctl->tree_lock); 3112 spin_unlock(&ctl->tree_lock);
3113 mutex_unlock(&ctl->cache_writeout_mutex);
3057 next_bitmap = true; 3114 next_bitmap = true;
3058 goto next; 3115 goto next;
3059 } 3116 }
@@ -3061,6 +3118,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3061 bytes = min(bytes, end - start); 3118 bytes = min(bytes, end - start);
3062 if (bytes < minlen) { 3119 if (bytes < minlen) {
3063 spin_unlock(&ctl->tree_lock); 3120 spin_unlock(&ctl->tree_lock);
3121 mutex_unlock(&ctl->cache_writeout_mutex);
3064 goto next; 3122 goto next;
3065 } 3123 }
3066 3124
@@ -3069,9 +3127,13 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3069 free_bitmap(ctl, entry); 3127 free_bitmap(ctl, entry);
3070 3128
3071 spin_unlock(&ctl->tree_lock); 3129 spin_unlock(&ctl->tree_lock);
3130 trim_entry.start = start;
3131 trim_entry.bytes = bytes;
3132 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3133 mutex_unlock(&ctl->cache_writeout_mutex);
3072 3134
3073 ret = do_trimming(block_group, total_trimmed, start, bytes, 3135 ret = do_trimming(block_group, total_trimmed, start, bytes,
3074 start, bytes); 3136 start, bytes, &trim_entry);
3075 if (ret) 3137 if (ret)
3076 break; 3138 break;
3077next: 3139next:
@@ -3101,11 +3163,52 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
3101 3163
3102 *trimmed = 0; 3164 *trimmed = 0;
3103 3165
3166 spin_lock(&block_group->lock);
3167 if (block_group->removed) {
3168 spin_unlock(&block_group->lock);
3169 return 0;
3170 }
3171 atomic_inc(&block_group->trimming);
3172 spin_unlock(&block_group->lock);
3173
3104 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); 3174 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
3105 if (ret) 3175 if (ret)
3106 return ret; 3176 goto out;
3107 3177
3108 ret = trim_bitmaps(block_group, trimmed, start, end, minlen); 3178 ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
3179out:
3180 spin_lock(&block_group->lock);
3181 if (atomic_dec_and_test(&block_group->trimming) &&
3182 block_group->removed) {
3183 struct extent_map_tree *em_tree;
3184 struct extent_map *em;
3185
3186 spin_unlock(&block_group->lock);
3187
3188 em_tree = &block_group->fs_info->mapping_tree.map_tree;
3189 write_lock(&em_tree->lock);
3190 em = lookup_extent_mapping(em_tree, block_group->key.objectid,
3191 1);
3192 BUG_ON(!em); /* logic error, can't happen */
3193 remove_extent_mapping(em_tree, em);
3194 write_unlock(&em_tree->lock);
3195
3196 lock_chunks(block_group->fs_info->chunk_root);
3197 list_del_init(&em->list);
3198 unlock_chunks(block_group->fs_info->chunk_root);
3199
3200 /* once for us and once for the tree */
3201 free_extent_map(em);
3202 free_extent_map(em);
3203
3204 /*
3205 * We've left one free space entry and other tasks trimming
 3206 * this block group have left one entry each. Free them.
3207 */
3208 __btrfs_remove_free_space_cache(block_group->free_space_ctl);
3209 } else {
3210 spin_unlock(&block_group->lock);
3211 }
3109 3212
3110 return ret; 3213 return ret;
3111} 3214}
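
The trimming changes above hinge on publishing each in-flight discard range on an on-stack node, linked into ctl->trimming_ranges under cache_writeout_mutex, so that a concurrent free-space cache writeout still accounts for the space. A userspace analogue of that pattern, with hypothetical names:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct range {
	uint64_t start, bytes;
	struct range *next;
};

static pthread_mutex_t writeout_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct range *in_flight;	/* removed from the free-space tree, not yet re-added */

static void trim_one(uint64_t start, uint64_t bytes)
{
	struct range r = { .start = start, .bytes = bytes };

	pthread_mutex_lock(&writeout_mutex);
	r.next = in_flight;		/* on-stack node, like struct btrfs_trim_range */
	in_flight = &r;
	pthread_mutex_unlock(&writeout_mutex);

	/* ... issue the discard for [start, start + bytes) ... */

	pthread_mutex_lock(&writeout_mutex);
	for (struct range **pp = &in_flight; *pp; pp = &(*pp)->next) {
		if (*pp == &r) {	/* re-add the space and unpublish the node */
			*pp = r.next;
			break;
		}
	}
	pthread_mutex_unlock(&writeout_mutex);
}

static void write_out_cache(void)
{
	pthread_mutex_lock(&writeout_mutex);
	/* persist the tree itself, then also persist ranges still being trimmed */
	for (struct range *r = in_flight; r; r = r->next)
		printf("also record %llu+%llu\n",
		       (unsigned long long)r->start, (unsigned long long)r->bytes);
	pthread_mutex_unlock(&writeout_mutex);
}
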
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 0cf4977ef70d..88b2238a0aed 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -38,6 +38,8 @@ struct btrfs_free_space_ctl {
38 u64 start; 38 u64 start;
39 struct btrfs_free_space_op *op; 39 struct btrfs_free_space_op *op;
40 void *private; 40 void *private;
41 struct mutex cache_writeout_mutex;
42 struct list_head trimming_ranges;
41}; 43};
42 44
43struct btrfs_free_space_op { 45struct btrfs_free_space_op {
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 83d646bd2e4b..74faea3a516e 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root)
178 root->root_key.objectid); 178 root->root_key.objectid);
179 if (IS_ERR(tsk)) { 179 if (IS_ERR(tsk)) {
180 btrfs_warn(root->fs_info, "failed to start inode caching task"); 180 btrfs_warn(root->fs_info, "failed to start inode caching task");
181 btrfs_clear_and_info(root, CHANGE_INODE_CACHE, 181 btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE,
182 "disabling inode map caching"); 182 "disabling inode map caching");
183 } 183 }
184} 184}
@@ -364,6 +364,8 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root)
364 ctl->start = 0; 364 ctl->start = 0;
365 ctl->private = NULL; 365 ctl->private = NULL;
366 ctl->op = &free_ino_op; 366 ctl->op = &free_ino_op;
367 INIT_LIST_HEAD(&ctl->trimming_ranges);
368 mutex_init(&ctl->cache_writeout_mutex);
367 369
368 /* 370 /*
369 * Initially we allow to use 16K of ram to cache chunks of 371 * Initially we allow to use 16K of ram to cache chunks of
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ff0dcc016b71..e687bb0dc73a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -382,7 +382,7 @@ static inline int inode_need_compress(struct inode *inode)
382 * are written in the same order that the flusher thread sent them 382 * are written in the same order that the flusher thread sent them
383 * down. 383 * down.
384 */ 384 */
385static noinline int compress_file_range(struct inode *inode, 385static noinline void compress_file_range(struct inode *inode,
386 struct page *locked_page, 386 struct page *locked_page,
387 u64 start, u64 end, 387 u64 start, u64 end,
388 struct async_cow *async_cow, 388 struct async_cow *async_cow,
@@ -411,14 +411,6 @@ static noinline int compress_file_range(struct inode *inode,
411 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) 411 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
412 btrfs_add_inode_defrag(NULL, inode); 412 btrfs_add_inode_defrag(NULL, inode);
413 413
414 /*
415 * skip compression for a small file range(<=blocksize) that
416 * isn't an inline extent, since it dosen't save disk space at all.
417 */
418 if ((end - start + 1) <= blocksize &&
419 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
420 goto cleanup_and_bail_uncompressed;
421
422 actual_end = min_t(u64, isize, end + 1); 414 actual_end = min_t(u64, isize, end + 1);
423again: 415again:
424 will_compress = 0; 416 will_compress = 0;
@@ -440,6 +432,14 @@ again:
440 432
441 total_compressed = actual_end - start; 433 total_compressed = actual_end - start;
442 434
435 /*
 436 * skip compression for a small file range (<= blocksize) that
 437 * isn't an inline extent, since it doesn't save disk space at all.
438 */
439 if (total_compressed <= blocksize &&
440 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
441 goto cleanup_and_bail_uncompressed;
442
443 /* we want to make sure that amount of ram required to uncompress 443 /* we want to make sure that amount of ram required to uncompress
444 * an extent is reasonable, so we limit the total size in ram 444 * an extent is reasonable, so we limit the total size in ram
445 * of a compressed extent to 128k. This is a crucial number 445 * of a compressed extent to 128k. This is a crucial number
@@ -527,7 +527,10 @@ cont:
527 if (ret <= 0) { 527 if (ret <= 0) {
528 unsigned long clear_flags = EXTENT_DELALLOC | 528 unsigned long clear_flags = EXTENT_DELALLOC |
529 EXTENT_DEFRAG; 529 EXTENT_DEFRAG;
530 unsigned long page_error_op;
531
530 clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; 532 clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
533 page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
531 534
532 /* 535 /*
533 * inline extent creation worked or returned error, 536 * inline extent creation worked or returned error,
@@ -538,6 +541,7 @@ cont:
538 clear_flags, PAGE_UNLOCK | 541 clear_flags, PAGE_UNLOCK |
539 PAGE_CLEAR_DIRTY | 542 PAGE_CLEAR_DIRTY |
540 PAGE_SET_WRITEBACK | 543 PAGE_SET_WRITEBACK |
544 page_error_op |
541 PAGE_END_WRITEBACK); 545 PAGE_END_WRITEBACK);
542 goto free_pages_out; 546 goto free_pages_out;
543 } 547 }
@@ -620,8 +624,7 @@ cleanup_and_bail_uncompressed:
620 *num_added += 1; 624 *num_added += 1;
621 } 625 }
622 626
623out: 627 return;
624 return ret;
625 628
626free_pages_out: 629free_pages_out:
627 for (i = 0; i < nr_pages_ret; i++) { 630 for (i = 0; i < nr_pages_ret; i++) {
@@ -629,8 +632,22 @@ free_pages_out:
629 page_cache_release(pages[i]); 632 page_cache_release(pages[i]);
630 } 633 }
631 kfree(pages); 634 kfree(pages);
635}
632 636
633 goto out; 637static void free_async_extent_pages(struct async_extent *async_extent)
638{
639 int i;
640
641 if (!async_extent->pages)
642 return;
643
644 for (i = 0; i < async_extent->nr_pages; i++) {
645 WARN_ON(async_extent->pages[i]->mapping);
646 page_cache_release(async_extent->pages[i]);
647 }
648 kfree(async_extent->pages);
649 async_extent->nr_pages = 0;
650 async_extent->pages = NULL;
634} 651}
635 652
636/* 653/*
@@ -639,7 +656,7 @@ free_pages_out:
639 * queued. We walk all the async extents created by compress_file_range 656 * queued. We walk all the async extents created by compress_file_range
640 * and send them down to the disk. 657 * and send them down to the disk.
641 */ 658 */
642static noinline int submit_compressed_extents(struct inode *inode, 659static noinline void submit_compressed_extents(struct inode *inode,
643 struct async_cow *async_cow) 660 struct async_cow *async_cow)
644{ 661{
645 struct async_extent *async_extent; 662 struct async_extent *async_extent;
@@ -651,9 +668,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
651 struct extent_io_tree *io_tree; 668 struct extent_io_tree *io_tree;
652 int ret = 0; 669 int ret = 0;
653 670
654 if (list_empty(&async_cow->extents))
655 return 0;
656
657again: 671again:
658 while (!list_empty(&async_cow->extents)) { 672 while (!list_empty(&async_cow->extents)) {
659 async_extent = list_entry(async_cow->extents.next, 673 async_extent = list_entry(async_cow->extents.next,
@@ -709,15 +723,7 @@ retry:
709 async_extent->compressed_size, 723 async_extent->compressed_size,
710 0, alloc_hint, &ins, 1, 1); 724 0, alloc_hint, &ins, 1, 1);
711 if (ret) { 725 if (ret) {
712 int i; 726 free_async_extent_pages(async_extent);
713
714 for (i = 0; i < async_extent->nr_pages; i++) {
715 WARN_ON(async_extent->pages[i]->mapping);
716 page_cache_release(async_extent->pages[i]);
717 }
718 kfree(async_extent->pages);
719 async_extent->nr_pages = 0;
720 async_extent->pages = NULL;
721 727
722 if (ret == -ENOSPC) { 728 if (ret == -ENOSPC) {
723 unlock_extent(io_tree, async_extent->start, 729 unlock_extent(io_tree, async_extent->start,
@@ -814,15 +820,26 @@ retry:
814 ins.objectid, 820 ins.objectid,
815 ins.offset, async_extent->pages, 821 ins.offset, async_extent->pages,
816 async_extent->nr_pages); 822 async_extent->nr_pages);
823 if (ret) {
824 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
825 struct page *p = async_extent->pages[0];
826 const u64 start = async_extent->start;
827 const u64 end = start + async_extent->ram_size - 1;
828
829 p->mapping = inode->i_mapping;
830 tree->ops->writepage_end_io_hook(p, start, end,
831 NULL, 0);
832 p->mapping = NULL;
833 extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
834 PAGE_END_WRITEBACK |
835 PAGE_SET_ERROR);
836 free_async_extent_pages(async_extent);
837 }
817 alloc_hint = ins.objectid + ins.offset; 838 alloc_hint = ins.objectid + ins.offset;
818 kfree(async_extent); 839 kfree(async_extent);
819 if (ret)
820 goto out;
821 cond_resched(); 840 cond_resched();
822 } 841 }
823 ret = 0; 842 return;
824out:
825 return ret;
826out_free_reserve: 843out_free_reserve:
827 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 844 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
828out_free: 845out_free:
@@ -832,7 +849,9 @@ out_free:
832 NULL, EXTENT_LOCKED | EXTENT_DELALLOC | 849 NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
833 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, 850 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
834 PAGE_UNLOCK | PAGE_CLEAR_DIRTY | 851 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
835 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); 852 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
853 PAGE_SET_ERROR);
854 free_async_extent_pages(async_extent);
836 kfree(async_extent); 855 kfree(async_extent);
837 goto again; 856 goto again;
838} 857}
@@ -1318,7 +1337,7 @@ next_slot:
1318 * we fall into common COW way. 1337 * we fall into common COW way.
1319 */ 1338 */
1320 if (!nolock) { 1339 if (!nolock) {
1321 err = btrfs_start_nocow_write(root); 1340 err = btrfs_start_write_no_snapshoting(root);
1322 if (!err) 1341 if (!err)
1323 goto out_check; 1342 goto out_check;
1324 } 1343 }
@@ -1342,7 +1361,7 @@ out_check:
1342 if (extent_end <= start) { 1361 if (extent_end <= start) {
1343 path->slots[0]++; 1362 path->slots[0]++;
1344 if (!nolock && nocow) 1363 if (!nolock && nocow)
1345 btrfs_end_nocow_write(root); 1364 btrfs_end_write_no_snapshoting(root);
1346 goto next_slot; 1365 goto next_slot;
1347 } 1366 }
1348 if (!nocow) { 1367 if (!nocow) {
@@ -1362,7 +1381,7 @@ out_check:
1362 page_started, nr_written, 1); 1381 page_started, nr_written, 1);
1363 if (ret) { 1382 if (ret) {
1364 if (!nolock && nocow) 1383 if (!nolock && nocow)
1365 btrfs_end_nocow_write(root); 1384 btrfs_end_write_no_snapshoting(root);
1366 goto error; 1385 goto error;
1367 } 1386 }
1368 cow_start = (u64)-1; 1387 cow_start = (u64)-1;
@@ -1413,7 +1432,7 @@ out_check:
1413 num_bytes); 1432 num_bytes);
1414 if (ret) { 1433 if (ret) {
1415 if (!nolock && nocow) 1434 if (!nolock && nocow)
1416 btrfs_end_nocow_write(root); 1435 btrfs_end_write_no_snapshoting(root);
1417 goto error; 1436 goto error;
1418 } 1437 }
1419 } 1438 }
@@ -1424,7 +1443,7 @@ out_check:
1424 EXTENT_DELALLOC, PAGE_UNLOCK | 1443 EXTENT_DELALLOC, PAGE_UNLOCK |
1425 PAGE_SET_PRIVATE2); 1444 PAGE_SET_PRIVATE2);
1426 if (!nolock && nocow) 1445 if (!nolock && nocow)
1427 btrfs_end_nocow_write(root); 1446 btrfs_end_write_no_snapshoting(root);
1428 cur_offset = extent_end; 1447 cur_offset = extent_end;
1429 if (cur_offset > end) 1448 if (cur_offset > end)
1430 break; 1449 break;
@@ -4580,6 +4599,26 @@ next:
4580 return err; 4599 return err;
4581} 4600}
4582 4601
4602static int wait_snapshoting_atomic_t(atomic_t *a)
4603{
4604 schedule();
4605 return 0;
4606}
4607
4608static void wait_for_snapshot_creation(struct btrfs_root *root)
4609{
4610 while (true) {
4611 int ret;
4612
4613 ret = btrfs_start_write_no_snapshoting(root);
4614 if (ret)
4615 break;
4616 wait_on_atomic_t(&root->will_be_snapshoted,
4617 wait_snapshoting_atomic_t,
4618 TASK_UNINTERRUPTIBLE);
4619 }
4620}
4621
4583static int btrfs_setsize(struct inode *inode, struct iattr *attr) 4622static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4584{ 4623{
4585 struct btrfs_root *root = BTRFS_I(inode)->root; 4624 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4604,17 +4643,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4604 4643
4605 if (newsize > oldsize) { 4644 if (newsize > oldsize) {
4606 truncate_pagecache(inode, newsize); 4645 truncate_pagecache(inode, newsize);
4646 /*
4647 * Don't do an expanding truncate while snapshoting is ongoing.
4648 * This is to ensure the snapshot captures a fully consistent
4649 * state of this file - if the snapshot captures this expanding
4650 * truncation, it must capture all writes that happened before
4651 * this truncation.
4652 */
4653 wait_for_snapshot_creation(root);
4607 ret = btrfs_cont_expand(inode, oldsize, newsize); 4654 ret = btrfs_cont_expand(inode, oldsize, newsize);
4608 if (ret) 4655 if (ret) {
4656 btrfs_end_write_no_snapshoting(root);
4609 return ret; 4657 return ret;
4658 }
4610 4659
4611 trans = btrfs_start_transaction(root, 1); 4660 trans = btrfs_start_transaction(root, 1);
4612 if (IS_ERR(trans)) 4661 if (IS_ERR(trans)) {
4662 btrfs_end_write_no_snapshoting(root);
4613 return PTR_ERR(trans); 4663 return PTR_ERR(trans);
4664 }
4614 4665
4615 i_size_write(inode, newsize); 4666 i_size_write(inode, newsize);
4616 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 4667 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4617 ret = btrfs_update_inode(trans, root, inode); 4668 ret = btrfs_update_inode(trans, root, inode);
4669 btrfs_end_write_no_snapshoting(root);
4618 btrfs_end_transaction(trans, root); 4670 btrfs_end_transaction(trans, root);
4619 } else { 4671 } else {
4620 4672
@@ -7000,9 +7052,12 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7000 btrfs_put_ordered_extent(ordered); 7052 btrfs_put_ordered_extent(ordered);
7001 } else { 7053 } else {
7002 /* Screw you mmap */ 7054 /* Screw you mmap */
7003 ret = filemap_write_and_wait_range(inode->i_mapping, 7055 ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
7004 lockstart, 7056 if (ret)
7005 lockend); 7057 break;
7058 ret = filemap_fdatawait_range(inode->i_mapping,
7059 lockstart,
7060 lockend);
7006 if (ret) 7061 if (ret)
7007 break; 7062 break;
7008 7063
@@ -9442,6 +9497,21 @@ out_inode:
9442 9497
9443} 9498}
9444 9499
9500/* Inspired by filemap_check_errors() */
9501int btrfs_inode_check_errors(struct inode *inode)
9502{
9503 int ret = 0;
9504
9505 if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
9506 test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
9507 ret = -ENOSPC;
9508 if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
9509 test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
9510 ret = -EIO;
9511
9512 return ret;
9513}
9514
9445static const struct inode_operations btrfs_dir_inode_operations = { 9515static const struct inode_operations btrfs_dir_inode_operations = {
9446 .getattr = btrfs_getattr, 9516 .getattr = btrfs_getattr,
9447 .lookup = btrfs_lookup, 9517 .lookup = btrfs_lookup,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 080fe66c0349..d49fe8a0f6b5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -617,7 +617,7 @@ fail:
617 return ret; 617 return ret;
618} 618}
619 619
620static void btrfs_wait_nocow_write(struct btrfs_root *root) 620static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
621{ 621{
622 s64 writers; 622 s64 writers;
623 DEFINE_WAIT(wait); 623 DEFINE_WAIT(wait);
@@ -649,7 +649,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
649 649
650 atomic_inc(&root->will_be_snapshoted); 650 atomic_inc(&root->will_be_snapshoted);
651 smp_mb__after_atomic(); 651 smp_mb__after_atomic();
652 btrfs_wait_nocow_write(root); 652 btrfs_wait_for_no_snapshoting_writes(root);
653 653
654 ret = btrfs_start_delalloc_inodes(root, 0); 654 ret = btrfs_start_delalloc_inodes(root, 0);
655 if (ret) 655 if (ret)
@@ -717,35 +717,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
717 if (ret) 717 if (ret)
718 goto fail; 718 goto fail;
719 719
720 /*
721 * If orphan cleanup did remove any orphans, it means the tree was
722 * modified and therefore the commit root is not the same as the
723 * current root anymore. This is a problem, because send uses the
724 * commit root and therefore can see inode items that don't exist
725 * in the current root anymore, and for example make calls to
726 * btrfs_iget, which will do tree lookups based on the current root
727 * and not on the commit root. Those lookups will fail, returning a
728 * -ESTALE error, and making send fail with that error. So make sure
729 * a send does not see any orphans we have just removed, and that it
730 * will see the same inodes regardless of whether a transaction
731 * commit happened before it started (meaning that the commit root
732 * will be the same as the current root) or not.
733 */
734 if (readonly && pending_snapshot->snap->node !=
735 pending_snapshot->snap->commit_root) {
736 trans = btrfs_join_transaction(pending_snapshot->snap);
737 if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
738 ret = PTR_ERR(trans);
739 goto fail;
740 }
741 if (!IS_ERR(trans)) {
742 ret = btrfs_commit_transaction(trans,
743 pending_snapshot->snap);
744 if (ret)
745 goto fail;
746 }
747 }
748
749 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 720 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
750 if (IS_ERR(inode)) { 721 if (IS_ERR(inode)) {
751 ret = PTR_ERR(inode); 722 ret = PTR_ERR(inode);
@@ -761,7 +732,8 @@ fail:
761free: 732free:
762 kfree(pending_snapshot); 733 kfree(pending_snapshot);
763out: 734out:
764 atomic_dec(&root->will_be_snapshoted); 735 if (atomic_dec_and_test(&root->will_be_snapshoted))
736 wake_up_atomic_t(&root->will_be_snapshoted);
765 return ret; 737 return ret;
766} 738}
767 739
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ac734ec4cc20..534544e08f76 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 INIT_LIST_HEAD(&entry->work_list); 220 INIT_LIST_HEAD(&entry->work_list);
221 init_completion(&entry->completion); 221 init_completion(&entry->completion);
222 INIT_LIST_HEAD(&entry->log_list); 222 INIT_LIST_HEAD(&entry->log_list);
223 INIT_LIST_HEAD(&entry->trans_list);
223 224
224 trace_btrfs_ordered_extent_add(inode, entry); 225 trace_btrfs_ordered_extent_add(inode, entry);
225 226
@@ -431,19 +432,31 @@ out:
431 432
432/* Needs to either be called under a log transaction or the log_mutex */ 433/* Needs to either be called under a log transaction or the log_mutex */
433void btrfs_get_logged_extents(struct inode *inode, 434void btrfs_get_logged_extents(struct inode *inode,
434 struct list_head *logged_list) 435 struct list_head *logged_list,
436 const loff_t start,
437 const loff_t end)
435{ 438{
436 struct btrfs_ordered_inode_tree *tree; 439 struct btrfs_ordered_inode_tree *tree;
437 struct btrfs_ordered_extent *ordered; 440 struct btrfs_ordered_extent *ordered;
438 struct rb_node *n; 441 struct rb_node *n;
442 struct rb_node *prev;
439 443
440 tree = &BTRFS_I(inode)->ordered_tree; 444 tree = &BTRFS_I(inode)->ordered_tree;
441 spin_lock_irq(&tree->lock); 445 spin_lock_irq(&tree->lock);
442 for (n = rb_first(&tree->tree); n; n = rb_next(n)) { 446 n = __tree_search(&tree->tree, end, &prev);
447 if (!n)
448 n = prev;
449 for (; n; n = rb_prev(n)) {
443 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); 450 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
451 if (ordered->file_offset > end)
452 continue;
453 if (entry_end(ordered) <= start)
454 break;
444 if (!list_empty(&ordered->log_list)) 455 if (!list_empty(&ordered->log_list))
445 continue; 456 continue;
446 list_add_tail(&ordered->log_list, logged_list); 457 if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
458 continue;
459 list_add(&ordered->log_list, logged_list);
447 atomic_inc(&ordered->refs); 460 atomic_inc(&ordered->refs);
448 } 461 }
449 spin_unlock_irq(&tree->lock); 462 spin_unlock_irq(&tree->lock);
@@ -472,7 +485,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list,
472 spin_unlock_irq(&log->log_extents_lock[index]); 485 spin_unlock_irq(&log->log_extents_lock[index]);
473} 486}
474 487
475void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) 488void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
489 struct btrfs_root *log, u64 transid)
476{ 490{
477 struct btrfs_ordered_extent *ordered; 491 struct btrfs_ordered_extent *ordered;
478 int index = transid % 2; 492 int index = transid % 2;
@@ -497,7 +511,8 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
497 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, 511 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
498 &ordered->flags)); 512 &ordered->flags));
499 513
500 btrfs_put_ordered_extent(ordered); 514 if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
515 list_add_tail(&ordered->trans_list, &trans->ordered);
501 spin_lock_irq(&log->log_extents_lock[index]); 516 spin_lock_irq(&log->log_extents_lock[index]);
502 } 517 }
503 spin_unlock_irq(&log->log_extents_lock[index]); 518 spin_unlock_irq(&log->log_extents_lock[index]);
@@ -725,30 +740,10 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
725 /* start IO across the range first to instantiate any delalloc 740 /* start IO across the range first to instantiate any delalloc
726 * extents 741 * extents
727 */ 742 */
728 ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end); 743 ret = btrfs_fdatawrite_range(inode, start, orig_end);
729 if (ret) 744 if (ret)
730 return ret; 745 return ret;
731 /* 746
732 * So with compression we will find and lock a dirty page and clear the
733 * first one as dirty, setup an async extent, and immediately return
734 * with the entire range locked but with nobody actually marked with
735 * writeback. So we can't just filemap_write_and_wait_range() and
736 * expect it to work since it will just kick off a thread to do the
737 * actual work. So we need to call filemap_fdatawrite_range _again_
738 * since it will wait on the page lock, which won't be unlocked until
739 * after the pages have been marked as writeback and so we're good to go
740 * from there. We have to do this otherwise we'll miss the ordered
741 * extents and that results in badness. Please Josef, do not think you
742 * know better and pull this out at some point in the future, it is
743 * right and you are wrong.
744 */
745 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
746 &BTRFS_I(inode)->runtime_flags)) {
747 ret = filemap_fdatawrite_range(inode->i_mapping, start,
748 orig_end);
749 if (ret)
750 return ret;
751 }
752 ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); 747 ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
753 if (ret) 748 if (ret)
754 return ret; 749 return ret;
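
btrfs_get_logged_extents() now collects only the ordered extents overlapping [start, end], walking the tree backwards from the search hit and stopping early. A standalone sketch of the same bounded traversal over a sorted array (hypothetical types; the kernel version starts from __tree_search(end) instead of the last element):

#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t offset, len; };

static void collect_range(const struct extent *e, int n,
			  uint64_t start, uint64_t end)
{
	/* 'e' is sorted by offset, like the ordered-extent rb-tree */
	for (int i = n - 1; i >= 0; i--) {
		uint64_t entry_end = e[i].offset + e[i].len;

		if (e[i].offset > end)
			continue;	/* starts after the range, keep walking back */
		if (entry_end <= start)
			break;		/* everything earlier ends before the range */
		printf("log extent %llu+%llu\n",
		       (unsigned long long)e[i].offset,
		       (unsigned long long)e[i].len);
	}
}
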
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d81a274d621e..e96cd4ccd805 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -71,6 +71,8 @@ struct btrfs_ordered_sum {
71 ordered extent */ 71 ordered extent */
72#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */ 72#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */
73 73
74#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
75 * in the logging code. */
74struct btrfs_ordered_extent { 76struct btrfs_ordered_extent {
75 /* logical offset in the file */ 77 /* logical offset in the file */
76 u64 file_offset; 78 u64 file_offset;
@@ -121,6 +123,9 @@ struct btrfs_ordered_extent {
121 /* If we need to wait on this to be done */ 123 /* If we need to wait on this to be done */
122 struct list_head log_list; 124 struct list_head log_list;
123 125
126 /* If the transaction needs to wait on this ordered extent */
127 struct list_head trans_list;
128
124 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ 129 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
125 wait_queue_head_t wait; 130 wait_queue_head_t wait;
126 131
@@ -193,11 +198,14 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
193int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); 198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
194void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); 199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
195void btrfs_get_logged_extents(struct inode *inode, 200void btrfs_get_logged_extents(struct inode *inode,
196 struct list_head *logged_list); 201 struct list_head *logged_list,
202 const loff_t start,
203 const loff_t end);
197void btrfs_put_logged_extents(struct list_head *logged_list); 204void btrfs_put_logged_extents(struct list_head *logged_list);
198void btrfs_submit_logged_extents(struct list_head *logged_list, 205void btrfs_submit_logged_extents(struct list_head *logged_list,
199 struct btrfs_root *log); 206 struct btrfs_root *log);
200void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 207void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
208 struct btrfs_root *log, u64 transid);
201void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 209void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
202int __init ordered_data_init(void); 210int __init ordered_data_init(void);
203void ordered_data_exit(void); 211void ordered_data_exit(void);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 6a41631cb959..8ab2a17bbba8 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,9 +58,23 @@
58 */ 58 */
59#define RBIO_CACHE_READY_BIT 3 59#define RBIO_CACHE_READY_BIT 3
60 60
61/*
62 * bbio and raid_map are managed by the caller, so we shouldn't free
63 * them here. Besides that, rbios with this flag must not be
64 * cached, because we need the raid_map to check whether two rbios
65 * belong to the same stripe, and the caller has very likely already
66 * freed the raid_map, so don't cache those rbios.
67 */
68#define RBIO_HOLD_BBIO_MAP_BIT 4
61 69
62#define RBIO_CACHE_SIZE 1024 70#define RBIO_CACHE_SIZE 1024
63 71
72enum btrfs_rbio_ops {
73 BTRFS_RBIO_WRITE = 0,
74 BTRFS_RBIO_READ_REBUILD = 1,
75 BTRFS_RBIO_PARITY_SCRUB = 2,
76};
77
64struct btrfs_raid_bio { 78struct btrfs_raid_bio {
65 struct btrfs_fs_info *fs_info; 79 struct btrfs_fs_info *fs_info;
66 struct btrfs_bio *bbio; 80 struct btrfs_bio *bbio;
@@ -117,13 +131,16 @@ struct btrfs_raid_bio {
117 /* number of data stripes (no p/q) */ 131 /* number of data stripes (no p/q) */
118 int nr_data; 132 int nr_data;
119 133
134 int real_stripes;
135
136 int stripe_npages;
120 /* 137 /*
121 * set if we're doing a parity rebuild 138 * set if we're doing a parity rebuild
122 * for a read from higher up, which is handled 139 * for a read from higher up, which is handled
123 * differently from a parity rebuild as part of 140 * differently from a parity rebuild as part of
124 * rmw 141 * rmw
125 */ 142 */
126 int read_rebuild; 143 enum btrfs_rbio_ops operation;
127 144
128 /* first bad stripe */ 145 /* first bad stripe */
129 int faila; 146 int faila;
@@ -131,6 +148,7 @@ struct btrfs_raid_bio {
131 /* second bad stripe (for raid6 use) */ 148 /* second bad stripe (for raid6 use) */
132 int failb; 149 int failb;
133 150
151 int scrubp;
134 /* 152 /*
135 * number of pages needed to represent the full 153 * number of pages needed to represent the full
136 * stripe 154 * stripe
@@ -144,8 +162,13 @@ struct btrfs_raid_bio {
144 */ 162 */
145 int bio_list_bytes; 163 int bio_list_bytes;
146 164
165 int generic_bio_cnt;
166
147 atomic_t refs; 167 atomic_t refs;
148 168
169 atomic_t stripes_pending;
170
171 atomic_t error;
149 /* 172 /*
150 * these are two arrays of pointers. We allocate the 173 * these are two arrays of pointers. We allocate the
151 * rbio big enough to hold them both and setup their 174 * rbio big enough to hold them both and setup their
@@ -162,6 +185,11 @@ struct btrfs_raid_bio {
162 * here for faster lookup 185 * here for faster lookup
163 */ 186 */
164 struct page **bio_pages; 187 struct page **bio_pages;
188
189 /*
190 * bitmap to record which horizontal stripe has data
191 */
192 unsigned long *dbitmap;
165}; 193};
166 194
167static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 195static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
@@ -176,6 +204,10 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio);
176static void index_rbio_pages(struct btrfs_raid_bio *rbio); 204static void index_rbio_pages(struct btrfs_raid_bio *rbio);
177static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 205static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
178 206
207static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
208 int need_check);
209static void async_scrub_parity(struct btrfs_raid_bio *rbio);
210
179/* 211/*
180 * the stripe hash table is used for locking, and to collect 212 * the stripe hash table is used for locking, and to collect
181 * bios in hopes of making a full stripe 213 * bios in hopes of making a full stripe
@@ -324,6 +356,7 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
324{ 356{
325 bio_list_merge(&dest->bio_list, &victim->bio_list); 357 bio_list_merge(&dest->bio_list, &victim->bio_list);
326 dest->bio_list_bytes += victim->bio_list_bytes; 358 dest->bio_list_bytes += victim->bio_list_bytes;
359 dest->generic_bio_cnt += victim->generic_bio_cnt;
327 bio_list_init(&victim->bio_list); 360 bio_list_init(&victim->bio_list);
328} 361}
329 362
@@ -577,11 +610,20 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
577 cur->raid_map[0]) 610 cur->raid_map[0])
578 return 0; 611 return 0;
579 612
580 /* reads can't merge with writes */ 613 /* we can't merge with different operations */
581 if (last->read_rebuild != 614 if (last->operation != cur->operation)
582 cur->read_rebuild) { 615 return 0;
616 /*
617 * We need to read the full stripe from the drive,
618 * check and repair the parity and write out the new results.
619 *
620 * We're not allowed to add any new bios to the
621 * bio list here, anyone else that wants to
622 * change this stripe needs to do their own rmw.
623 */
624 if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
625 cur->operation == BTRFS_RBIO_PARITY_SCRUB)
583 return 0; 626 return 0;
584 }
585 627
586 return 1; 628 return 1;
587} 629}
@@ -601,7 +643,7 @@ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
601 */ 643 */
602static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) 644static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
603{ 645{
604 if (rbio->nr_data + 1 == rbio->bbio->num_stripes) 646 if (rbio->nr_data + 1 == rbio->real_stripes)
605 return NULL; 647 return NULL;
606 648
607 index += ((rbio->nr_data + 1) * rbio->stripe_len) >> 649 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
@@ -772,11 +814,14 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
772 spin_unlock(&rbio->bio_list_lock); 814 spin_unlock(&rbio->bio_list_lock);
773 spin_unlock_irqrestore(&h->lock, flags); 815 spin_unlock_irqrestore(&h->lock, flags);
774 816
775 if (next->read_rebuild) 817 if (next->operation == BTRFS_RBIO_READ_REBUILD)
776 async_read_rebuild(next); 818 async_read_rebuild(next);
777 else { 819 else if (next->operation == BTRFS_RBIO_WRITE) {
778 steal_rbio(rbio, next); 820 steal_rbio(rbio, next);
779 async_rmw_stripe(next); 821 async_rmw_stripe(next);
822 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
823 steal_rbio(rbio, next);
824 async_scrub_parity(next);
780 } 825 }
781 826
782 goto done_nolock; 827 goto done_nolock;
@@ -796,6 +841,21 @@ done_nolock:
796 remove_rbio_from_cache(rbio); 841 remove_rbio_from_cache(rbio);
797} 842}
798 843
844static inline void
845__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
846{
847 if (need) {
848 kfree(raid_map);
849 kfree(bbio);
850 }
851}
852
853static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
854{
855 __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
856 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
857}
858
799static void __free_raid_bio(struct btrfs_raid_bio *rbio) 859static void __free_raid_bio(struct btrfs_raid_bio *rbio)
800{ 860{
801 int i; 861 int i;
@@ -814,8 +874,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
814 rbio->stripe_pages[i] = NULL; 874 rbio->stripe_pages[i] = NULL;
815 } 875 }
816 } 876 }
817 kfree(rbio->raid_map); 877
818 kfree(rbio->bbio); 878 free_bbio_and_raid_map(rbio);
879
819 kfree(rbio); 880 kfree(rbio);
820} 881}
821 882
@@ -833,6 +894,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
833{ 894{
834 struct bio *cur = bio_list_get(&rbio->bio_list); 895 struct bio *cur = bio_list_get(&rbio->bio_list);
835 struct bio *next; 896 struct bio *next;
897
898 if (rbio->generic_bio_cnt)
899 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
900
836 free_raid_bio(rbio); 901 free_raid_bio(rbio);
837 902
838 while (cur) { 903 while (cur) {
@@ -858,13 +923,13 @@ static void raid_write_end_io(struct bio *bio, int err)
858 923
859 bio_put(bio); 924 bio_put(bio);
860 925
861 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 926 if (!atomic_dec_and_test(&rbio->stripes_pending))
862 return; 927 return;
863 928
864 err = 0; 929 err = 0;
865 930
866 /* OK, we have read all the stripes we need to. */ 931 /* OK, we have read all the stripes we need to. */
867 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 932 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
868 err = -EIO; 933 err = -EIO;
869 934
870 rbio_orig_end_io(rbio, err, 0); 935 rbio_orig_end_io(rbio, err, 0);
@@ -925,16 +990,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
925{ 990{
926 struct btrfs_raid_bio *rbio; 991 struct btrfs_raid_bio *rbio;
927 int nr_data = 0; 992 int nr_data = 0;
928 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); 993 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
994 int num_pages = rbio_nr_pages(stripe_len, real_stripes);
995 int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
929 void *p; 996 void *p;
930 997
931 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, 998 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
999 DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
932 GFP_NOFS); 1000 GFP_NOFS);
933 if (!rbio) { 1001 if (!rbio)
934 kfree(raid_map);
935 kfree(bbio);
936 return ERR_PTR(-ENOMEM); 1002 return ERR_PTR(-ENOMEM);
937 }
938 1003
939 bio_list_init(&rbio->bio_list); 1004 bio_list_init(&rbio->bio_list);
940 INIT_LIST_HEAD(&rbio->plug_list); 1005 INIT_LIST_HEAD(&rbio->plug_list);
@@ -946,9 +1011,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
946 rbio->fs_info = root->fs_info; 1011 rbio->fs_info = root->fs_info;
947 rbio->stripe_len = stripe_len; 1012 rbio->stripe_len = stripe_len;
948 rbio->nr_pages = num_pages; 1013 rbio->nr_pages = num_pages;
1014 rbio->real_stripes = real_stripes;
1015 rbio->stripe_npages = stripe_npages;
949 rbio->faila = -1; 1016 rbio->faila = -1;
950 rbio->failb = -1; 1017 rbio->failb = -1;
951 atomic_set(&rbio->refs, 1); 1018 atomic_set(&rbio->refs, 1);
1019 atomic_set(&rbio->error, 0);
1020 atomic_set(&rbio->stripes_pending, 0);
952 1021
953 /* 1022 /*
954 * the stripe_pages and bio_pages array point to the extra 1023 * the stripe_pages and bio_pages array point to the extra
@@ -957,11 +1026,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
957 p = rbio + 1; 1026 p = rbio + 1;
958 rbio->stripe_pages = p; 1027 rbio->stripe_pages = p;
959 rbio->bio_pages = p + sizeof(struct page *) * num_pages; 1028 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
1029 rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
960 1030
961 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) 1031 if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
962 nr_data = bbio->num_stripes - 2; 1032 nr_data = real_stripes - 2;
963 else 1033 else
964 nr_data = bbio->num_stripes - 1; 1034 nr_data = real_stripes - 1;
965 1035
966 rbio->nr_data = nr_data; 1036 rbio->nr_data = nr_data;
967 return rbio; 1037 return rbio;
@@ -1073,7 +1143,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1073static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 1143static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1074{ 1144{
1075 if (rbio->faila >= 0 || rbio->failb >= 0) { 1145 if (rbio->faila >= 0 || rbio->failb >= 0) {
1076 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); 1146 BUG_ON(rbio->faila == rbio->real_stripes - 1);
1077 __raid56_parity_recover(rbio); 1147 __raid56_parity_recover(rbio);
1078 } else { 1148 } else {
1079 finish_rmw(rbio); 1149 finish_rmw(rbio);
@@ -1134,7 +1204,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1134static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 1204static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1135{ 1205{
1136 struct btrfs_bio *bbio = rbio->bbio; 1206 struct btrfs_bio *bbio = rbio->bbio;
1137 void *pointers[bbio->num_stripes]; 1207 void *pointers[rbio->real_stripes];
1138 int stripe_len = rbio->stripe_len; 1208 int stripe_len = rbio->stripe_len;
1139 int nr_data = rbio->nr_data; 1209 int nr_data = rbio->nr_data;
1140 int stripe; 1210 int stripe;
@@ -1148,11 +1218,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1148 1218
1149 bio_list_init(&bio_list); 1219 bio_list_init(&bio_list);
1150 1220
1151 if (bbio->num_stripes - rbio->nr_data == 1) { 1221 if (rbio->real_stripes - rbio->nr_data == 1) {
1152 p_stripe = bbio->num_stripes - 1; 1222 p_stripe = rbio->real_stripes - 1;
1153 } else if (bbio->num_stripes - rbio->nr_data == 2) { 1223 } else if (rbio->real_stripes - rbio->nr_data == 2) {
1154 p_stripe = bbio->num_stripes - 2; 1224 p_stripe = rbio->real_stripes - 2;
1155 q_stripe = bbio->num_stripes - 1; 1225 q_stripe = rbio->real_stripes - 1;
1156 } else { 1226 } else {
1157 BUG(); 1227 BUG();
1158 } 1228 }
@@ -1169,7 +1239,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1169 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1239 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1170 spin_unlock_irq(&rbio->bio_list_lock); 1240 spin_unlock_irq(&rbio->bio_list_lock);
1171 1241
1172 atomic_set(&rbio->bbio->error, 0); 1242 atomic_set(&rbio->error, 0);
1173 1243
1174 /* 1244 /*
1175 * now that we've set rmw_locked, run through the 1245 * now that we've set rmw_locked, run through the
@@ -1209,7 +1279,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1209 SetPageUptodate(p); 1279 SetPageUptodate(p);
1210 pointers[stripe++] = kmap(p); 1280 pointers[stripe++] = kmap(p);
1211 1281
1212 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, 1282 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1213 pointers); 1283 pointers);
1214 } else { 1284 } else {
1215 /* raid5 */ 1285 /* raid5 */
@@ -1218,7 +1288,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1218 } 1288 }
1219 1289
1220 1290
1221 for (stripe = 0; stripe < bbio->num_stripes; stripe++) 1291 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
1222 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 1292 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1223 } 1293 }
1224 1294
@@ -1227,7 +1297,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1227 * higher layers (the bio_list in our rbio) and our p/q. Ignore 1297 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1228 * everything else. 1298 * everything else.
1229 */ 1299 */
1230 for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 1300 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1231 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 1301 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1232 struct page *page; 1302 struct page *page;
1233 if (stripe < rbio->nr_data) { 1303 if (stripe < rbio->nr_data) {
@@ -1245,8 +1315,34 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1245 } 1315 }
1246 } 1316 }
1247 1317
1248 atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); 1318 if (likely(!bbio->num_tgtdevs))
1249 BUG_ON(atomic_read(&bbio->stripes_pending) == 0); 1319 goto write_data;
1320
1321 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1322 if (!bbio->tgtdev_map[stripe])
1323 continue;
1324
1325 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1326 struct page *page;
1327 if (stripe < rbio->nr_data) {
1328 page = page_in_rbio(rbio, stripe, pagenr, 1);
1329 if (!page)
1330 continue;
1331 } else {
1332 page = rbio_stripe_page(rbio, stripe, pagenr);
1333 }
1334
1335 ret = rbio_add_io_page(rbio, &bio_list, page,
1336 rbio->bbio->tgtdev_map[stripe],
1337 pagenr, rbio->stripe_len);
1338 if (ret)
1339 goto cleanup;
1340 }
1341 }
1342
1343write_data:
1344 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1345 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1250 1346
1251 while (1) { 1347 while (1) {
1252 bio = bio_list_pop(&bio_list); 1348 bio = bio_list_pop(&bio_list);
@@ -1283,7 +1379,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1283 stripe = &rbio->bbio->stripes[i]; 1379 stripe = &rbio->bbio->stripes[i];
1284 stripe_start = stripe->physical; 1380 stripe_start = stripe->physical;
1285 if (physical >= stripe_start && 1381 if (physical >= stripe_start &&
1286 physical < stripe_start + rbio->stripe_len) { 1382 physical < stripe_start + rbio->stripe_len &&
1383 bio->bi_bdev == stripe->dev->bdev) {
1287 return i; 1384 return i;
1288 } 1385 }
1289 } 1386 }
@@ -1331,11 +1428,11 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1331 if (rbio->faila == -1) { 1428 if (rbio->faila == -1) {
1332 /* first failure on this rbio */ 1429 /* first failure on this rbio */
1333 rbio->faila = failed; 1430 rbio->faila = failed;
1334 atomic_inc(&rbio->bbio->error); 1431 atomic_inc(&rbio->error);
1335 } else if (rbio->failb == -1) { 1432 } else if (rbio->failb == -1) {
1336 /* second failure on this rbio */ 1433 /* second failure on this rbio */
1337 rbio->failb = failed; 1434 rbio->failb = failed;
1338 atomic_inc(&rbio->bbio->error); 1435 atomic_inc(&rbio->error);
1339 } else { 1436 } else {
1340 ret = -EIO; 1437 ret = -EIO;
1341 } 1438 }
@@ -1394,11 +1491,11 @@ static void raid_rmw_end_io(struct bio *bio, int err)
1394 1491
1395 bio_put(bio); 1492 bio_put(bio);
1396 1493
1397 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 1494 if (!atomic_dec_and_test(&rbio->stripes_pending))
1398 return; 1495 return;
1399 1496
1400 err = 0; 1497 err = 0;
1401 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 1498 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1402 goto cleanup; 1499 goto cleanup;
1403 1500
1404 /* 1501 /*
@@ -1439,7 +1536,6 @@ static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1439static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1536static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1440{ 1537{
1441 int bios_to_read = 0; 1538 int bios_to_read = 0;
1442 struct btrfs_bio *bbio = rbio->bbio;
1443 struct bio_list bio_list; 1539 struct bio_list bio_list;
1444 int ret; 1540 int ret;
1445 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 1541 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1455,7 +1551,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1455 1551
1456 index_rbio_pages(rbio); 1552 index_rbio_pages(rbio);
1457 1553
1458 atomic_set(&rbio->bbio->error, 0); 1554 atomic_set(&rbio->error, 0);
1459 /* 1555 /*
1460 * build a list of bios to read all the missing parts of this 1556 * build a list of bios to read all the missing parts of this
1461 * stripe 1557 * stripe
@@ -1503,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1503 * the bbio may be freed once we submit the last bio. Make sure 1599 * the bbio may be freed once we submit the last bio. Make sure
1504 * not to touch it after that 1600 * not to touch it after that
1505 */ 1601 */
1506 atomic_set(&bbio->stripes_pending, bios_to_read); 1602 atomic_set(&rbio->stripes_pending, bios_to_read);
1507 while (1) { 1603 while (1) {
1508 bio = bio_list_pop(&bio_list); 1604 bio = bio_list_pop(&bio_list);
1509 if (!bio) 1605 if (!bio)
@@ -1686,19 +1782,30 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1686 struct btrfs_raid_bio *rbio; 1782 struct btrfs_raid_bio *rbio;
1687 struct btrfs_plug_cb *plug = NULL; 1783 struct btrfs_plug_cb *plug = NULL;
1688 struct blk_plug_cb *cb; 1784 struct blk_plug_cb *cb;
1785 int ret;
1689 1786
1690 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 1787 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1691 if (IS_ERR(rbio)) 1788 if (IS_ERR(rbio)) {
1789 __free_bbio_and_raid_map(bbio, raid_map, 1);
1692 return PTR_ERR(rbio); 1790 return PTR_ERR(rbio);
1791 }
1693 bio_list_add(&rbio->bio_list, bio); 1792 bio_list_add(&rbio->bio_list, bio);
1694 rbio->bio_list_bytes = bio->bi_iter.bi_size; 1793 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1794 rbio->operation = BTRFS_RBIO_WRITE;
1795
1796 btrfs_bio_counter_inc_noblocked(root->fs_info);
1797 rbio->generic_bio_cnt = 1;
1695 1798
1696 /* 1799 /*
1697 * don't plug on full rbios, just get them out the door 1800 * don't plug on full rbios, just get them out the door
1698 * as quickly as we can 1801 * as quickly as we can
1699 */ 1802 */
1700 if (rbio_is_full(rbio)) 1803 if (rbio_is_full(rbio)) {
1701 return full_stripe_write(rbio); 1804 ret = full_stripe_write(rbio);
1805 if (ret)
1806 btrfs_bio_counter_dec(root->fs_info);
1807 return ret;
1808 }
1702 1809
1703 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, 1810 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1704 sizeof(*plug)); 1811 sizeof(*plug));
@@ -1709,10 +1816,13 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1709 INIT_LIST_HEAD(&plug->rbio_list); 1816 INIT_LIST_HEAD(&plug->rbio_list);
1710 } 1817 }
1711 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1818 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1819 ret = 0;
1712 } else { 1820 } else {
1713 return __raid56_parity_write(rbio); 1821 ret = __raid56_parity_write(rbio);
1822 if (ret)
1823 btrfs_bio_counter_dec(root->fs_info);
1714 } 1824 }
1715 return 0; 1825 return ret;
1716} 1826}
1717 1827
1718/* 1828/*
@@ -1730,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1730 int err; 1840 int err;
1731 int i; 1841 int i;
1732 1842
1733 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), 1843 pointers = kzalloc(rbio->real_stripes * sizeof(void *),
1734 GFP_NOFS); 1844 GFP_NOFS);
1735 if (!pointers) { 1845 if (!pointers) {
1736 err = -ENOMEM; 1846 err = -ENOMEM;
@@ -1740,7 +1850,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1740 faila = rbio->faila; 1850 faila = rbio->faila;
1741 failb = rbio->failb; 1851 failb = rbio->failb;
1742 1852
1743 if (rbio->read_rebuild) { 1853 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1744 spin_lock_irq(&rbio->bio_list_lock); 1854 spin_lock_irq(&rbio->bio_list_lock);
1745 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1855 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1746 spin_unlock_irq(&rbio->bio_list_lock); 1856 spin_unlock_irq(&rbio->bio_list_lock);
@@ -1749,15 +1859,23 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1749 index_rbio_pages(rbio); 1859 index_rbio_pages(rbio);
1750 1860
1751 for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1861 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1862 /*
 1863	 * When doing a parity scrub, we use the bitmap to mark the
 1864	 * horizontal stripes in which we have data.
1865 */
1866 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1867 !test_bit(pagenr, rbio->dbitmap))
1868 continue;
1869
1752 /* setup our array of pointers with pages 1870 /* setup our array of pointers with pages
1753 * from each stripe 1871 * from each stripe
1754 */ 1872 */
1755 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 1873 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1756 /* 1874 /*
1757 * if we're rebuilding a read, we have to use 1875 * if we're rebuilding a read, we have to use
1758 * pages from the bio list 1876 * pages from the bio list
1759 */ 1877 */
1760 if (rbio->read_rebuild && 1878 if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1761 (stripe == faila || stripe == failb)) { 1879 (stripe == faila || stripe == failb)) {
1762 page = page_in_rbio(rbio, stripe, pagenr, 0); 1880 page = page_in_rbio(rbio, stripe, pagenr, 0);
1763 } else { 1881 } else {
@@ -1767,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1767 } 1885 }
1768 1886
1769 /* all raid6 handling here */ 1887 /* all raid6 handling here */
1770 if (rbio->raid_map[rbio->bbio->num_stripes - 1] == 1888 if (rbio->raid_map[rbio->real_stripes - 1] ==
1771 RAID6_Q_STRIPE) { 1889 RAID6_Q_STRIPE) {
1772 1890
1773 /* 1891 /*
@@ -1817,10 +1935,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1817 } 1935 }
1818 1936
1819 if (rbio->raid_map[failb] == RAID5_P_STRIPE) { 1937 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1820 raid6_datap_recov(rbio->bbio->num_stripes, 1938 raid6_datap_recov(rbio->real_stripes,
1821 PAGE_SIZE, faila, pointers); 1939 PAGE_SIZE, faila, pointers);
1822 } else { 1940 } else {
1823 raid6_2data_recov(rbio->bbio->num_stripes, 1941 raid6_2data_recov(rbio->real_stripes,
1824 PAGE_SIZE, faila, failb, 1942 PAGE_SIZE, faila, failb,
1825 pointers); 1943 pointers);
1826 } 1944 }
@@ -1850,7 +1968,7 @@ pstripe:
1850 * know they can be trusted. If this was a read reconstruction, 1968 * know they can be trusted. If this was a read reconstruction,
1851 * other endio functions will fiddle the uptodate bits 1969 * other endio functions will fiddle the uptodate bits
1852 */ 1970 */
1853 if (!rbio->read_rebuild) { 1971 if (rbio->operation == BTRFS_RBIO_WRITE) {
1854 for (i = 0; i < nr_pages; i++) { 1972 for (i = 0; i < nr_pages; i++) {
1855 if (faila != -1) { 1973 if (faila != -1) {
1856 page = rbio_stripe_page(rbio, faila, i); 1974 page = rbio_stripe_page(rbio, faila, i);
@@ -1862,12 +1980,12 @@ pstripe:
1862 } 1980 }
1863 } 1981 }
1864 } 1982 }
1865 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 1983 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1866 /* 1984 /*
1867 * if we're rebuilding a read, we have to use 1985 * if we're rebuilding a read, we have to use
1868 * pages from the bio list 1986 * pages from the bio list
1869 */ 1987 */
1870 if (rbio->read_rebuild && 1988 if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
1871 (stripe == faila || stripe == failb)) { 1989 (stripe == faila || stripe == failb)) {
1872 page = page_in_rbio(rbio, stripe, pagenr, 0); 1990 page = page_in_rbio(rbio, stripe, pagenr, 0);
1873 } else { 1991 } else {
@@ -1882,9 +2000,9 @@ cleanup:
1882 kfree(pointers); 2000 kfree(pointers);
1883 2001
1884cleanup_io: 2002cleanup_io:
1885 2003 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1886 if (rbio->read_rebuild) { 2004 if (err == 0 &&
1887 if (err == 0) 2005 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
1888 cache_rbio_pages(rbio); 2006 cache_rbio_pages(rbio);
1889 else 2007 else
1890 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2008 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -1893,7 +2011,13 @@ cleanup_io:
1893 } else if (err == 0) { 2011 } else if (err == 0) {
1894 rbio->faila = -1; 2012 rbio->faila = -1;
1895 rbio->failb = -1; 2013 rbio->failb = -1;
1896 finish_rmw(rbio); 2014
2015 if (rbio->operation == BTRFS_RBIO_WRITE)
2016 finish_rmw(rbio);
2017 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2018 finish_parity_scrub(rbio, 0);
2019 else
2020 BUG();
1897 } else { 2021 } else {
1898 rbio_orig_end_io(rbio, err, 0); 2022 rbio_orig_end_io(rbio, err, 0);
1899 } 2023 }
@@ -1917,10 +2041,10 @@ static void raid_recover_end_io(struct bio *bio, int err)
1917 set_bio_pages_uptodate(bio); 2041 set_bio_pages_uptodate(bio);
1918 bio_put(bio); 2042 bio_put(bio);
1919 2043
1920 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 2044 if (!atomic_dec_and_test(&rbio->stripes_pending))
1921 return; 2045 return;
1922 2046
1923 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 2047 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1924 rbio_orig_end_io(rbio, -EIO, 0); 2048 rbio_orig_end_io(rbio, -EIO, 0);
1925 else 2049 else
1926 __raid_recover_end_io(rbio); 2050 __raid_recover_end_io(rbio);
@@ -1937,7 +2061,6 @@ static void raid_recover_end_io(struct bio *bio, int err)
1937static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 2061static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1938{ 2062{
1939 int bios_to_read = 0; 2063 int bios_to_read = 0;
1940 struct btrfs_bio *bbio = rbio->bbio;
1941 struct bio_list bio_list; 2064 struct bio_list bio_list;
1942 int ret; 2065 int ret;
1943 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 2066 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
@@ -1951,16 +2074,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1951 if (ret) 2074 if (ret)
1952 goto cleanup; 2075 goto cleanup;
1953 2076
1954 atomic_set(&rbio->bbio->error, 0); 2077 atomic_set(&rbio->error, 0);
1955 2078
1956 /* 2079 /*
1957 * read everything that hasn't failed. Thanks to the 2080 * read everything that hasn't failed. Thanks to the
1958 * stripe cache, it is possible that some or all of these 2081 * stripe cache, it is possible that some or all of these
1959 * pages are going to be uptodate. 2082 * pages are going to be uptodate.
1960 */ 2083 */
1961 for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 2084 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1962 if (rbio->faila == stripe || rbio->failb == stripe) { 2085 if (rbio->faila == stripe || rbio->failb == stripe) {
1963 atomic_inc(&rbio->bbio->error); 2086 atomic_inc(&rbio->error);
1964 continue; 2087 continue;
1965 } 2088 }
1966 2089
@@ -1990,7 +2113,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1990 * were up to date, or we might have no bios to read because 2113 * were up to date, or we might have no bios to read because
1991 * the devices were gone. 2114 * the devices were gone.
1992 */ 2115 */
1993 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { 2116 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
1994 __raid_recover_end_io(rbio); 2117 __raid_recover_end_io(rbio);
1995 goto out; 2118 goto out;
1996 } else { 2119 } else {
@@ -2002,7 +2125,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2002 * the bbio may be freed once we submit the last bio. Make sure 2125 * the bbio may be freed once we submit the last bio. Make sure
2003 * not to touch it after that 2126 * not to touch it after that
2004 */ 2127 */
2005 atomic_set(&bbio->stripes_pending, bios_to_read); 2128 atomic_set(&rbio->stripes_pending, bios_to_read);
2006 while (1) { 2129 while (1) {
2007 bio = bio_list_pop(&bio_list); 2130 bio = bio_list_pop(&bio_list);
2008 if (!bio) 2131 if (!bio)
@@ -2021,7 +2144,7 @@ out:
2021 return 0; 2144 return 0;
2022 2145
2023cleanup: 2146cleanup:
2024 if (rbio->read_rebuild) 2147 if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
2025 rbio_orig_end_io(rbio, -EIO, 0); 2148 rbio_orig_end_io(rbio, -EIO, 0);
2026 return -EIO; 2149 return -EIO;
2027} 2150}
@@ -2034,34 +2157,42 @@ cleanup:
2034 */ 2157 */
2035int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 2158int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2036 struct btrfs_bio *bbio, u64 *raid_map, 2159 struct btrfs_bio *bbio, u64 *raid_map,
2037 u64 stripe_len, int mirror_num) 2160 u64 stripe_len, int mirror_num, int generic_io)
2038{ 2161{
2039 struct btrfs_raid_bio *rbio; 2162 struct btrfs_raid_bio *rbio;
2040 int ret; 2163 int ret;
2041 2164
2042 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 2165 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2043 if (IS_ERR(rbio)) 2166 if (IS_ERR(rbio)) {
2167 __free_bbio_and_raid_map(bbio, raid_map, generic_io);
2044 return PTR_ERR(rbio); 2168 return PTR_ERR(rbio);
2169 }
2045 2170
2046 rbio->read_rebuild = 1; 2171 rbio->operation = BTRFS_RBIO_READ_REBUILD;
2047 bio_list_add(&rbio->bio_list, bio); 2172 bio_list_add(&rbio->bio_list, bio);
2048 rbio->bio_list_bytes = bio->bi_iter.bi_size; 2173 rbio->bio_list_bytes = bio->bi_iter.bi_size;
2049 2174
2050 rbio->faila = find_logical_bio_stripe(rbio, bio); 2175 rbio->faila = find_logical_bio_stripe(rbio, bio);
2051 if (rbio->faila == -1) { 2176 if (rbio->faila == -1) {
2052 BUG(); 2177 BUG();
2053 kfree(raid_map); 2178 __free_bbio_and_raid_map(bbio, raid_map, generic_io);
2054 kfree(bbio);
2055 kfree(rbio); 2179 kfree(rbio);
2056 return -EIO; 2180 return -EIO;
2057 } 2181 }
2058 2182
2183 if (generic_io) {
2184 btrfs_bio_counter_inc_noblocked(root->fs_info);
2185 rbio->generic_bio_cnt = 1;
2186 } else {
2187 set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
2188 }
2189
2059 /* 2190 /*
2060 * reconstruct from the q stripe if they are 2191 * reconstruct from the q stripe if they are
2061 * asking for mirror 3 2192 * asking for mirror 3
2062 */ 2193 */
2063 if (mirror_num == 3) 2194 if (mirror_num == 3)
2064 rbio->failb = bbio->num_stripes - 2; 2195 rbio->failb = rbio->real_stripes - 2;
2065 2196
2066 ret = lock_stripe_add(rbio); 2197 ret = lock_stripe_add(rbio);
2067 2198
@@ -2098,3 +2229,483 @@ static void read_rebuild_work(struct btrfs_work *work)
2098 rbio = container_of(work, struct btrfs_raid_bio, work); 2229 rbio = container_of(work, struct btrfs_raid_bio, work);
2099 __raid56_parity_recover(rbio); 2230 __raid56_parity_recover(rbio);
2100} 2231}
2232
2233/*
2234 * The following code is used to scrub/replace the parity stripe
2235 *
 2236 * Note: We need to make sure that all the pages added to the scrub/replace
 2237 * raid bio are correct and will not change during the scrub/replace. That
 2238 * is, those pages hold only metadata or file data with checksums.
2239 */
2240
2241struct btrfs_raid_bio *
2242raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
2243 struct btrfs_bio *bbio, u64 *raid_map,
2244 u64 stripe_len, struct btrfs_device *scrub_dev,
2245 unsigned long *dbitmap, int stripe_nsectors)
2246{
2247 struct btrfs_raid_bio *rbio;
2248 int i;
2249
2250 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2251 if (IS_ERR(rbio))
2252 return NULL;
2253 bio_list_add(&rbio->bio_list, bio);
2254 /*
2255 * This is a special bio which is used to hold the completion handler
 2256	 * and make the scrub rbio look similar to the other types
2257 */
2258 ASSERT(!bio->bi_iter.bi_size);
2259 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2260
2261 for (i = 0; i < rbio->real_stripes; i++) {
2262 if (bbio->stripes[i].dev == scrub_dev) {
2263 rbio->scrubp = i;
2264 break;
2265 }
2266 }
2267
 2268	/* For now we only support the case where sectorsize equals PAGE_SIZE */
2269 ASSERT(root->sectorsize == PAGE_SIZE);
2270 ASSERT(rbio->stripe_npages == stripe_nsectors);
2271 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2272
2273 return rbio;
2274}
2275
2276void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
2277 struct page *page, u64 logical)
2278{
2279 int stripe_offset;
2280 int index;
2281
2282 ASSERT(logical >= rbio->raid_map[0]);
2283 ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
2284 rbio->stripe_len * rbio->nr_data);
2285 stripe_offset = (int)(logical - rbio->raid_map[0]);
2286 index = stripe_offset >> PAGE_CACHE_SHIFT;
2287 rbio->bio_pages[index] = page;
2288}
2289
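A small standalone sketch of the index calculation used by raid56_parity_add_scrub_pages above (illustrative only; scrub_page_index is a made-up name and a 4 KiB page size is assumed):

	#include <assert.h>
	#include <stdint.h>

	/* One slot per 4 KiB page, counted from the first data byte of the
	 * full stripe (raid_map[0]). */
	static int scrub_page_index(uint64_t logical, uint64_t raid_map0)
	{
		return (int)((logical - raid_map0) >> 12);
	}

	int main(void)
	{
		/* data starts at 1 MiB; a page at 1 MiB + 40 KiB lands in slot 10 */
		assert(scrub_page_index((1 << 20) + (40 << 10), 1 << 20) == 10);
		return 0;
	}
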
2290/*
 2291 * We only scrub the parity for the horizontal stripes where we have correct
 2292 * data, so we don't need to allocate pages for all the stripes.
2293 */
2294static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2295{
2296 int i;
2297 int bit;
2298 int index;
2299 struct page *page;
2300
2301 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2302 for (i = 0; i < rbio->real_stripes; i++) {
2303 index = i * rbio->stripe_npages + bit;
2304 if (rbio->stripe_pages[index])
2305 continue;
2306
2307 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2308 if (!page)
2309 return -ENOMEM;
2310 rbio->stripe_pages[index] = page;
2311 ClearPageUptodate(page);
2312 }
2313 }
2314 return 0;
2315}
2316
2317/*
 2318 * end io function used by finish_parity_scrub. When we finally
 2319 * get here, we've written out the repaired parity for the stripe
2320 */
2321static void raid_write_parity_end_io(struct bio *bio, int err)
2322{
2323 struct btrfs_raid_bio *rbio = bio->bi_private;
2324
2325 if (err)
2326 fail_bio_stripe(rbio, bio);
2327
2328 bio_put(bio);
2329
2330 if (!atomic_dec_and_test(&rbio->stripes_pending))
2331 return;
2332
2333 err = 0;
2334
2335 if (atomic_read(&rbio->error))
2336 err = -EIO;
2337
2338 rbio_orig_end_io(rbio, err, 0);
2339}
2340
2341static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2342 int need_check)
2343{
2344 struct btrfs_bio *bbio = rbio->bbio;
2345 void *pointers[rbio->real_stripes];
2346 DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
2347 int nr_data = rbio->nr_data;
2348 int stripe;
2349 int pagenr;
2350 int p_stripe = -1;
2351 int q_stripe = -1;
2352 struct page *p_page = NULL;
2353 struct page *q_page = NULL;
2354 struct bio_list bio_list;
2355 struct bio *bio;
2356 int is_replace = 0;
2357 int ret;
2358
2359 bio_list_init(&bio_list);
2360
2361 if (rbio->real_stripes - rbio->nr_data == 1) {
2362 p_stripe = rbio->real_stripes - 1;
2363 } else if (rbio->real_stripes - rbio->nr_data == 2) {
2364 p_stripe = rbio->real_stripes - 2;
2365 q_stripe = rbio->real_stripes - 1;
2366 } else {
2367 BUG();
2368 }
2369
2370 if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
2371 is_replace = 1;
2372 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2373 }
2374
2375 /*
 2376	 * The higher layers (the scrubber) are unlikely to
 2377	 * use this area of the disk again soon, so don't
 2378	 * cache it.
2379 */
2380 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2381
2382 if (!need_check)
2383 goto writeback;
2384
2385 p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2386 if (!p_page)
2387 goto cleanup;
2388 SetPageUptodate(p_page);
2389
2390 if (q_stripe != -1) {
2391 q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2392 if (!q_page) {
2393 __free_page(p_page);
2394 goto cleanup;
2395 }
2396 SetPageUptodate(q_page);
2397 }
2398
2399 atomic_set(&rbio->error, 0);
2400
2401 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2402 struct page *p;
2403 void *parity;
2404 /* first collect one page from each data stripe */
2405 for (stripe = 0; stripe < nr_data; stripe++) {
2406 p = page_in_rbio(rbio, stripe, pagenr, 0);
2407 pointers[stripe] = kmap(p);
2408 }
2409
2410 /* then add the parity stripe */
2411 pointers[stripe++] = kmap(p_page);
2412
2413 if (q_stripe != -1) {
2414
2415 /*
2416 * raid6, add the qstripe and call the
2417 * library function to fill in our p/q
2418 */
2419 pointers[stripe++] = kmap(q_page);
2420
2421 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
2422 pointers);
2423 } else {
2424 /* raid5 */
2425 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
2426 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
2427 }
2428
 2429		/* Check the scrubbing parity and repair it */
2430 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2431 parity = kmap(p);
2432 if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
2433 memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
2434 else
 2435			/* Parity is correct, no need to write it back */
2436 bitmap_clear(rbio->dbitmap, pagenr, 1);
2437 kunmap(p);
2438
2439 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
2440 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
2441 }
2442
2443 __free_page(p_page);
2444 if (q_page)
2445 __free_page(q_page);
2446
2447writeback:
2448 /*
 2449	 * time to start writing. Make bios for the parity pages that need to be
 2450	 * rewritten on the device we are scrubbing (and, below, on the replace
 2451	 * target if there is one). Ignore everything else.
2452 */
2453 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2454 struct page *page;
2455
2456 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2457 ret = rbio_add_io_page(rbio, &bio_list,
2458 page, rbio->scrubp, pagenr, rbio->stripe_len);
2459 if (ret)
2460 goto cleanup;
2461 }
2462
2463 if (!is_replace)
2464 goto submit_write;
2465
2466 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2467 struct page *page;
2468
2469 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2470 ret = rbio_add_io_page(rbio, &bio_list, page,
2471 bbio->tgtdev_map[rbio->scrubp],
2472 pagenr, rbio->stripe_len);
2473 if (ret)
2474 goto cleanup;
2475 }
2476
2477submit_write:
2478 nr_data = bio_list_size(&bio_list);
2479 if (!nr_data) {
 2480		/* All the parity pages are correct */
2481 rbio_orig_end_io(rbio, 0, 0);
2482 return;
2483 }
2484
2485 atomic_set(&rbio->stripes_pending, nr_data);
2486
2487 while (1) {
2488 bio = bio_list_pop(&bio_list);
2489 if (!bio)
2490 break;
2491
2492 bio->bi_private = rbio;
2493 bio->bi_end_io = raid_write_parity_end_io;
2494 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2495 submit_bio(WRITE, bio);
2496 }
2497 return;
2498
2499cleanup:
2500 rbio_orig_end_io(rbio, -EIO, 0);
2501}
2502
2503static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2504{
2505 if (stripe >= 0 && stripe < rbio->nr_data)
2506 return 1;
2507 return 0;
2508}
2509
2510/*
2511 * While we're doing the parity check and repair, we could have errors
2512 * in reading pages off the disk. This checks for errors and if we're
2513 * not able to read the page it'll trigger parity reconstruction. The
2514 * parity scrub will be finished after we've reconstructed the failed
2515 * stripes
2516 */
2517static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2518{
2519 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2520 goto cleanup;
2521
2522 if (rbio->faila >= 0 || rbio->failb >= 0) {
2523 int dfail = 0, failp = -1;
2524
2525 if (is_data_stripe(rbio, rbio->faila))
2526 dfail++;
2527 else if (is_parity_stripe(rbio->faila))
2528 failp = rbio->faila;
2529
2530 if (is_data_stripe(rbio, rbio->failb))
2531 dfail++;
2532 else if (is_parity_stripe(rbio->failb))
2533 failp = rbio->failb;
2534
2535 /*
 2536		 * Because we cannot use the parity being scrubbed to repair
 2537		 * the data, our repair capability is reduced.
 2538		 * (In the RAID5 case, we cannot repair anything.)
2539 */
2540 if (dfail > rbio->bbio->max_errors - 1)
2541 goto cleanup;
2542
2543 /*
 2544		 * If all the data is good and only the parity is bad,
 2545		 * just repair the parity.
2546 */
2547 if (dfail == 0) {
2548 finish_parity_scrub(rbio, 0);
2549 return;
2550 }
2551
2552 /*
 2553		 * Here we have one corrupted data stripe and one corrupted
 2554		 * parity on RAID6. If the corrupted parity is the one being
 2555		 * scrubbed, we are lucky and can use the other parity to repair
 2556		 * the data; otherwise we cannot repair the data stripe.
2557 */
2558 if (failp != rbio->scrubp)
2559 goto cleanup;
2560
2561 __raid_recover_end_io(rbio);
2562 } else {
2563 finish_parity_scrub(rbio, 1);
2564 }
2565 return;
2566
2567cleanup:
2568 rbio_orig_end_io(rbio, -EIO, 0);
2569}
2570
2571/*
 2572 * end io for the read phase of the parity scrub. All the bios here are
 2573 * physical stripe bios we've read from the disk so we can recalculate the
 2574 * parity of the stripe.
 2575 *
 2576 * This will usually kick off finish_parity_scrub once all the bios are read
 2577 * in, but it may trigger parity reconstruction if we had any errors along the way
2578 */
2579static void raid56_parity_scrub_end_io(struct bio *bio, int err)
2580{
2581 struct btrfs_raid_bio *rbio = bio->bi_private;
2582
2583 if (err)
2584 fail_bio_stripe(rbio, bio);
2585 else
2586 set_bio_pages_uptodate(bio);
2587
2588 bio_put(bio);
2589
2590 if (!atomic_dec_and_test(&rbio->stripes_pending))
2591 return;
2592
2593 /*
 2594	 * this will normally call finish_parity_scrub to start our
 2595	 * write, but if there are any failed stripes we'll reconstruct
 2596	 * from parity first
2597 */
2598 validate_rbio_for_parity_scrub(rbio);
2599}
2600
2601static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2602{
2603 int bios_to_read = 0;
2604 struct bio_list bio_list;
2605 int ret;
2606 int pagenr;
2607 int stripe;
2608 struct bio *bio;
2609
2610 ret = alloc_rbio_essential_pages(rbio);
2611 if (ret)
2612 goto cleanup;
2613
2614 bio_list_init(&bio_list);
2615
2616 atomic_set(&rbio->error, 0);
2617 /*
2618 * build a list of bios to read all the missing parts of this
2619 * stripe
2620 */
2621 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2622 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2623 struct page *page;
2624 /*
2625 * we want to find all the pages missing from
2626 * the rbio and read them from the disk. If
2627 * page_in_rbio finds a page in the bio list
2628 * we don't need to read it off the stripe.
2629 */
2630 page = page_in_rbio(rbio, stripe, pagenr, 1);
2631 if (page)
2632 continue;
2633
2634 page = rbio_stripe_page(rbio, stripe, pagenr);
2635 /*
2636 * the bio cache may have handed us an uptodate
2637 * page. If so, be happy and use it
2638 */
2639 if (PageUptodate(page))
2640 continue;
2641
2642 ret = rbio_add_io_page(rbio, &bio_list, page,
2643 stripe, pagenr, rbio->stripe_len);
2644 if (ret)
2645 goto cleanup;
2646 }
2647 }
2648
2649 bios_to_read = bio_list_size(&bio_list);
2650 if (!bios_to_read) {
2651 /*
2652 * this can happen if others have merged with
2653 * us, it means there is nothing left to read.
2654 * But if there are missing devices it may not be
2655 * safe to do the full stripe write yet.
2656 */
2657 goto finish;
2658 }
2659
2660 /*
2661 * the bbio may be freed once we submit the last bio. Make sure
2662 * not to touch it after that
2663 */
2664 atomic_set(&rbio->stripes_pending, bios_to_read);
2665 while (1) {
2666 bio = bio_list_pop(&bio_list);
2667 if (!bio)
2668 break;
2669
2670 bio->bi_private = rbio;
2671 bio->bi_end_io = raid56_parity_scrub_end_io;
2672
2673 btrfs_bio_wq_end_io(rbio->fs_info, bio,
2674 BTRFS_WQ_ENDIO_RAID56);
2675
2676 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2677 submit_bio(READ, bio);
2678 }
2679 /* the actual write will happen once the reads are done */
2680 return;
2681
2682cleanup:
2683 rbio_orig_end_io(rbio, -EIO, 0);
2684 return;
2685
2686finish:
2687 validate_rbio_for_parity_scrub(rbio);
2688}
2689
2690static void scrub_parity_work(struct btrfs_work *work)
2691{
2692 struct btrfs_raid_bio *rbio;
2693
2694 rbio = container_of(work, struct btrfs_raid_bio, work);
2695 raid56_parity_scrub_stripe(rbio);
2696}
2697
2698static void async_scrub_parity(struct btrfs_raid_bio *rbio)
2699{
2700 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
2701 scrub_parity_work, NULL, NULL);
2702
2703 btrfs_queue_work(rbio->fs_info->rmw_workers,
2704 &rbio->work);
2705}
2706
2707void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2708{
2709 if (!lock_stripe_add(rbio))
2710 async_scrub_parity(rbio);
2711}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index ea5d73bfdfbe..31d4a157b5e3 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -39,13 +39,25 @@ static inline int nr_data_stripes(struct map_lookup *map)
39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ 39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
40 ((x) == RAID6_Q_STRIPE)) 40 ((x) == RAID6_Q_STRIPE))
41 41
42struct btrfs_raid_bio;
43struct btrfs_device;
44
42int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 45int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
43 struct btrfs_bio *bbio, u64 *raid_map, 46 struct btrfs_bio *bbio, u64 *raid_map,
44 u64 stripe_len, int mirror_num); 47 u64 stripe_len, int mirror_num, int generic_io);
45int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 48int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
46 struct btrfs_bio *bbio, u64 *raid_map, 49 struct btrfs_bio *bbio, u64 *raid_map,
47 u64 stripe_len); 50 u64 stripe_len);
48 51
52struct btrfs_raid_bio *
53raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
54 struct btrfs_bio *bbio, u64 *raid_map,
55 u64 stripe_len, struct btrfs_device *scrub_dev,
56 unsigned long *dbitmap, int stripe_nsectors);
57void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
58 struct page *page, u64 logical);
59void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
60
49int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); 61int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
50void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); 62void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
51#endif 63#endif
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efa083113827..f2bb13a23f86 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -63,10 +63,18 @@ struct scrub_ctx;
63 */ 63 */
64#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ 64#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
65 65
66struct scrub_recover {
67 atomic_t refs;
68 struct btrfs_bio *bbio;
69 u64 *raid_map;
70 u64 map_length;
71};
72
66struct scrub_page { 73struct scrub_page {
67 struct scrub_block *sblock; 74 struct scrub_block *sblock;
68 struct page *page; 75 struct page *page;
69 struct btrfs_device *dev; 76 struct btrfs_device *dev;
77 struct list_head list;
70 u64 flags; /* extent flags */ 78 u64 flags; /* extent flags */
71 u64 generation; 79 u64 generation;
72 u64 logical; 80 u64 logical;
@@ -79,6 +87,8 @@ struct scrub_page {
79 unsigned int io_error:1; 87 unsigned int io_error:1;
80 }; 88 };
81 u8 csum[BTRFS_CSUM_SIZE]; 89 u8 csum[BTRFS_CSUM_SIZE];
90
91 struct scrub_recover *recover;
82}; 92};
83 93
84struct scrub_bio { 94struct scrub_bio {
@@ -105,14 +115,52 @@ struct scrub_block {
105 atomic_t outstanding_pages; 115 atomic_t outstanding_pages;
106 atomic_t ref_count; /* free mem on transition to zero */ 116 atomic_t ref_count; /* free mem on transition to zero */
107 struct scrub_ctx *sctx; 117 struct scrub_ctx *sctx;
118 struct scrub_parity *sparity;
108 struct { 119 struct {
109 unsigned int header_error:1; 120 unsigned int header_error:1;
110 unsigned int checksum_error:1; 121 unsigned int checksum_error:1;
111 unsigned int no_io_error_seen:1; 122 unsigned int no_io_error_seen:1;
112 unsigned int generation_error:1; /* also sets header_error */ 123 unsigned int generation_error:1; /* also sets header_error */
124
125 /* The following is for the data used to check parity */
126 /* It is for the data with checksum */
127 unsigned int data_corrected:1;
113 }; 128 };
114}; 129};
115 130
 131/* Used for chunks that have a parity stripe, such as RAID5/6 */
132struct scrub_parity {
133 struct scrub_ctx *sctx;
134
135 struct btrfs_device *scrub_dev;
136
137 u64 logic_start;
138
139 u64 logic_end;
140
141 int nsectors;
142
143 int stripe_len;
144
145 atomic_t ref_count;
146
147 struct list_head spages;
148
 149	/* Work item for the parity check and repair */
150 struct btrfs_work work;
151
152 /* Mark the parity blocks which have data */
153 unsigned long *dbitmap;
154
155 /*
 156	 * Mark the parity blocks which have data, but for which errors
 157	 * happened when reading or checking the data
158 */
159 unsigned long *ebitmap;
160
161 unsigned long bitmap[0];
162};
163
116struct scrub_wr_ctx { 164struct scrub_wr_ctx {
117 struct scrub_bio *wr_curr_bio; 165 struct scrub_bio *wr_curr_bio;
118 struct btrfs_device *tgtdev; 166 struct btrfs_device *tgtdev;
@@ -196,7 +244,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
196static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 244static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
197 struct scrub_block *sblock, int is_metadata, 245 struct scrub_block *sblock, int is_metadata,
198 int have_csum, u8 *csum, u64 generation, 246 int have_csum, u8 *csum, u64 generation,
199 u16 csum_size); 247 u16 csum_size, int retry_failed_mirror);
200static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 248static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
201 struct scrub_block *sblock, 249 struct scrub_block *sblock,
202 int is_metadata, int have_csum, 250 int is_metadata, int have_csum,
@@ -218,6 +266,8 @@ static void scrub_block_get(struct scrub_block *sblock);
218static void scrub_block_put(struct scrub_block *sblock); 266static void scrub_block_put(struct scrub_block *sblock);
219static void scrub_page_get(struct scrub_page *spage); 267static void scrub_page_get(struct scrub_page *spage);
220static void scrub_page_put(struct scrub_page *spage); 268static void scrub_page_put(struct scrub_page *spage);
269static void scrub_parity_get(struct scrub_parity *sparity);
270static void scrub_parity_put(struct scrub_parity *sparity);
221static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, 271static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
222 struct scrub_page *spage); 272 struct scrub_page *spage);
223static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 273static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -790,6 +840,20 @@ out:
790 scrub_pending_trans_workers_dec(sctx); 840 scrub_pending_trans_workers_dec(sctx);
791} 841}
792 842
843static inline void scrub_get_recover(struct scrub_recover *recover)
844{
845 atomic_inc(&recover->refs);
846}
847
848static inline void scrub_put_recover(struct scrub_recover *recover)
849{
850 if (atomic_dec_and_test(&recover->refs)) {
851 kfree(recover->bbio);
852 kfree(recover->raid_map);
853 kfree(recover);
854 }
855}
856
793/* 857/*
794 * scrub_handle_errored_block gets called when either verification of the 858 * scrub_handle_errored_block gets called when either verification of the
795 * pages failed or the bio failed to read, e.g. with EIO. In the latter 859 * pages failed or the bio failed to read, e.g. with EIO. In the latter
@@ -906,7 +970,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
906 970
907 /* build and submit the bios for the failed mirror, check checksums */ 971 /* build and submit the bios for the failed mirror, check checksums */
908 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 972 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
909 csum, generation, sctx->csum_size); 973 csum, generation, sctx->csum_size, 1);
910 974
911 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 975 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
912 sblock_bad->no_io_error_seen) { 976 sblock_bad->no_io_error_seen) {
@@ -920,6 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
920 */ 984 */
921 spin_lock(&sctx->stat_lock); 985 spin_lock(&sctx->stat_lock);
922 sctx->stat.unverified_errors++; 986 sctx->stat.unverified_errors++;
987 sblock_to_check->data_corrected = 1;
923 spin_unlock(&sctx->stat_lock); 988 spin_unlock(&sctx->stat_lock);
924 989
925 if (sctx->is_dev_replace) 990 if (sctx->is_dev_replace)
@@ -1019,7 +1084,7 @@ nodatasum_case:
1019 /* build and submit the bios, check checksums */ 1084 /* build and submit the bios, check checksums */
1020 scrub_recheck_block(fs_info, sblock_other, is_metadata, 1085 scrub_recheck_block(fs_info, sblock_other, is_metadata,
1021 have_csum, csum, generation, 1086 have_csum, csum, generation,
1022 sctx->csum_size); 1087 sctx->csum_size, 0);
1023 1088
1024 if (!sblock_other->header_error && 1089 if (!sblock_other->header_error &&
1025 !sblock_other->checksum_error && 1090 !sblock_other->checksum_error &&
@@ -1169,7 +1234,7 @@ nodatasum_case:
1169 */ 1234 */
1170 scrub_recheck_block(fs_info, sblock_bad, 1235 scrub_recheck_block(fs_info, sblock_bad,
1171 is_metadata, have_csum, csum, 1236 is_metadata, have_csum, csum,
1172 generation, sctx->csum_size); 1237 generation, sctx->csum_size, 1);
1173 if (!sblock_bad->header_error && 1238 if (!sblock_bad->header_error &&
1174 !sblock_bad->checksum_error && 1239 !sblock_bad->checksum_error &&
1175 sblock_bad->no_io_error_seen) 1240 sblock_bad->no_io_error_seen)
@@ -1180,6 +1245,7 @@ nodatasum_case:
1180corrected_error: 1245corrected_error:
1181 spin_lock(&sctx->stat_lock); 1246 spin_lock(&sctx->stat_lock);
1182 sctx->stat.corrected_errors++; 1247 sctx->stat.corrected_errors++;
1248 sblock_to_check->data_corrected = 1;
1183 spin_unlock(&sctx->stat_lock); 1249 spin_unlock(&sctx->stat_lock);
1184 printk_ratelimited_in_rcu(KERN_ERR 1250 printk_ratelimited_in_rcu(KERN_ERR
1185 "BTRFS: fixed up error at logical %llu on dev %s\n", 1251 "BTRFS: fixed up error at logical %llu on dev %s\n",
@@ -1201,11 +1267,18 @@ out:
1201 mirror_index++) { 1267 mirror_index++) {
1202 struct scrub_block *sblock = sblocks_for_recheck + 1268 struct scrub_block *sblock = sblocks_for_recheck +
1203 mirror_index; 1269 mirror_index;
1270 struct scrub_recover *recover;
1204 int page_index; 1271 int page_index;
1205 1272
1206 for (page_index = 0; page_index < sblock->page_count; 1273 for (page_index = 0; page_index < sblock->page_count;
1207 page_index++) { 1274 page_index++) {
1208 sblock->pagev[page_index]->sblock = NULL; 1275 sblock->pagev[page_index]->sblock = NULL;
1276 recover = sblock->pagev[page_index]->recover;
1277 if (recover) {
1278 scrub_put_recover(recover);
1279 sblock->pagev[page_index]->recover =
1280 NULL;
1281 }
1209 scrub_page_put(sblock->pagev[page_index]); 1282 scrub_page_put(sblock->pagev[page_index]);
1210 } 1283 }
1211 } 1284 }
@@ -1215,14 +1288,63 @@ out:
1215 return 0; 1288 return 0;
1216} 1289}
1217 1290
1291static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
1292{
1293 if (raid_map) {
1294 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
1295 return 3;
1296 else
1297 return 2;
1298 } else {
1299 return (int)bbio->num_stripes;
1300 }
1301}
1302
1303static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
1304 u64 mapped_length,
1305 int nstripes, int mirror,
1306 int *stripe_index,
1307 u64 *stripe_offset)
1308{
1309 int i;
1310
1311 if (raid_map) {
1312 /* RAID5/6 */
1313 for (i = 0; i < nstripes; i++) {
1314 if (raid_map[i] == RAID6_Q_STRIPE ||
1315 raid_map[i] == RAID5_P_STRIPE)
1316 continue;
1317
1318 if (logical >= raid_map[i] &&
1319 logical < raid_map[i] + mapped_length)
1320 break;
1321 }
1322
1323 *stripe_index = i;
1324 *stripe_offset = logical - raid_map[i];
1325 } else {
1326 /* The other RAID type */
1327 *stripe_index = mirror;
1328 *stripe_offset = 0;
1329 }
1330}
1331
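A rough standalone sketch of what scrub_stripe_index_and_offset does for the RAID5/6 case above (illustrative only; locate, P_MARK and Q_MARK are made-up names standing in for the kernel's RAID5_P_STRIPE/RAID6_Q_STRIPE markers, and the search window is simplified to the stripe length):

	#include <assert.h>
	#include <stdint.h>

	#define P_MARK ((uint64_t)-2)		/* stand-in for RAID5_P_STRIPE */
	#define Q_MARK ((uint64_t)-1)		/* stand-in for RAID6_Q_STRIPE */

	/* Find which data stripe holds 'logical' and the byte offset inside it. */
	static void locate(uint64_t logical, const uint64_t *raid_map, int nstripes,
			   uint64_t stripe_len, int *stripe_index, uint64_t *stripe_offset)
	{
		int i;

		for (i = 0; i < nstripes; i++) {
			if (raid_map[i] == P_MARK || raid_map[i] == Q_MARK)
				continue;	/* parity slots never hold the logical address */
			if (logical >= raid_map[i] && logical < raid_map[i] + stripe_len)
				break;
		}
		*stripe_index = i;
		*stripe_offset = logical - raid_map[i];
	}

	int main(void)
	{
		/* two 64 KiB data stripes starting at 1 MiB, then P and Q */
		uint64_t map[4] = { 1 << 20, (1 << 20) + (64 << 10), P_MARK, Q_MARK };
		int idx;
		uint64_t off;

		locate((1 << 20) + (64 << 10) + 4096, map, 4, 64 << 10, &idx, &off);
		assert(idx == 1 && off == 4096);
		return 0;
	}
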
1218static int scrub_setup_recheck_block(struct scrub_ctx *sctx, 1332static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1219 struct btrfs_fs_info *fs_info, 1333 struct btrfs_fs_info *fs_info,
1220 struct scrub_block *original_sblock, 1334 struct scrub_block *original_sblock,
1221 u64 length, u64 logical, 1335 u64 length, u64 logical,
1222 struct scrub_block *sblocks_for_recheck) 1336 struct scrub_block *sblocks_for_recheck)
1223{ 1337{
1338 struct scrub_recover *recover;
1339 struct btrfs_bio *bbio;
1340 u64 *raid_map;
1341 u64 sublen;
1342 u64 mapped_length;
1343 u64 stripe_offset;
1344 int stripe_index;
1224 int page_index; 1345 int page_index;
1225 int mirror_index; 1346 int mirror_index;
1347 int nmirrors;
1226 int ret; 1348 int ret;
1227 1349
1228 /* 1350 /*
@@ -1233,23 +1355,39 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1233 1355
1234 page_index = 0; 1356 page_index = 0;
1235 while (length > 0) { 1357 while (length > 0) {
1236 u64 sublen = min_t(u64, length, PAGE_SIZE); 1358 sublen = min_t(u64, length, PAGE_SIZE);
1237 u64 mapped_length = sublen; 1359 mapped_length = sublen;
1238 struct btrfs_bio *bbio = NULL; 1360 bbio = NULL;
1361 raid_map = NULL;
1239 1362
1240 /* 1363 /*
1241 * with a length of PAGE_SIZE, each returned stripe 1364 * with a length of PAGE_SIZE, each returned stripe
1242 * represents one mirror 1365 * represents one mirror
1243 */ 1366 */
1244 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, 1367 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
1245 &mapped_length, &bbio, 0); 1368 &mapped_length, &bbio, 0, &raid_map);
1246 if (ret || !bbio || mapped_length < sublen) { 1369 if (ret || !bbio || mapped_length < sublen) {
1247 kfree(bbio); 1370 kfree(bbio);
1371 kfree(raid_map);
1248 return -EIO; 1372 return -EIO;
1249 } 1373 }
1250 1374
1375 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1376 if (!recover) {
1377 kfree(bbio);
1378 kfree(raid_map);
1379 return -ENOMEM;
1380 }
1381
1382 atomic_set(&recover->refs, 1);
1383 recover->bbio = bbio;
1384 recover->raid_map = raid_map;
1385 recover->map_length = mapped_length;
1386
1251 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); 1387 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1252 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; 1388
1389 nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
1390 for (mirror_index = 0; mirror_index < nmirrors;
1253 mirror_index++) { 1391 mirror_index++) {
1254 struct scrub_block *sblock; 1392 struct scrub_block *sblock;
1255 struct scrub_page *page; 1393 struct scrub_page *page;
@@ -1265,26 +1403,38 @@ leave_nomem:
1265 spin_lock(&sctx->stat_lock); 1403 spin_lock(&sctx->stat_lock);
1266 sctx->stat.malloc_errors++; 1404 sctx->stat.malloc_errors++;
1267 spin_unlock(&sctx->stat_lock); 1405 spin_unlock(&sctx->stat_lock);
1268 kfree(bbio); 1406 scrub_put_recover(recover);
1269 return -ENOMEM; 1407 return -ENOMEM;
1270 } 1408 }
1271 scrub_page_get(page); 1409 scrub_page_get(page);
1272 sblock->pagev[page_index] = page; 1410 sblock->pagev[page_index] = page;
1273 page->logical = logical; 1411 page->logical = logical;
1274 page->physical = bbio->stripes[mirror_index].physical; 1412
1413 scrub_stripe_index_and_offset(logical, raid_map,
1414 mapped_length,
1415 bbio->num_stripes,
1416 mirror_index,
1417 &stripe_index,
1418 &stripe_offset);
1419 page->physical = bbio->stripes[stripe_index].physical +
1420 stripe_offset;
1421 page->dev = bbio->stripes[stripe_index].dev;
1422
1275 BUG_ON(page_index >= original_sblock->page_count); 1423 BUG_ON(page_index >= original_sblock->page_count);
1276 page->physical_for_dev_replace = 1424 page->physical_for_dev_replace =
1277 original_sblock->pagev[page_index]-> 1425 original_sblock->pagev[page_index]->
1278 physical_for_dev_replace; 1426 physical_for_dev_replace;
1279 /* for missing devices, dev->bdev is NULL */ 1427 /* for missing devices, dev->bdev is NULL */
1280 page->dev = bbio->stripes[mirror_index].dev;
1281 page->mirror_num = mirror_index + 1; 1428 page->mirror_num = mirror_index + 1;
1282 sblock->page_count++; 1429 sblock->page_count++;
1283 page->page = alloc_page(GFP_NOFS); 1430 page->page = alloc_page(GFP_NOFS);
1284 if (!page->page) 1431 if (!page->page)
1285 goto leave_nomem; 1432 goto leave_nomem;
1433
1434 scrub_get_recover(recover);
1435 page->recover = recover;
1286 } 1436 }
1287 kfree(bbio); 1437 scrub_put_recover(recover);
1288 length -= sublen; 1438 length -= sublen;
1289 logical += sublen; 1439 logical += sublen;
1290 page_index++; 1440 page_index++;
@@ -1293,6 +1443,51 @@ leave_nomem:
1293 return 0; 1443 return 0;
1294} 1444}
1295 1445
1446struct scrub_bio_ret {
1447 struct completion event;
1448 int error;
1449};
1450
1451static void scrub_bio_wait_endio(struct bio *bio, int error)
1452{
1453 struct scrub_bio_ret *ret = bio->bi_private;
1454
1455 ret->error = error;
1456 complete(&ret->event);
1457}
1458
1459static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1460{
1461 return page->recover && page->recover->raid_map;
1462}
1463
1464static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1465 struct bio *bio,
1466 struct scrub_page *page)
1467{
1468 struct scrub_bio_ret done;
1469 int ret;
1470
1471 init_completion(&done.event);
1472 done.error = 0;
1473 bio->bi_iter.bi_sector = page->logical >> 9;
1474 bio->bi_private = &done;
1475 bio->bi_end_io = scrub_bio_wait_endio;
1476
1477 ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
1478 page->recover->raid_map,
1479 page->recover->map_length,
1480 page->mirror_num, 0);
1481 if (ret)
1482 return ret;
1483
1484 wait_for_completion(&done.event);
1485 if (done.error)
1486 return -EIO;
1487
1488 return 0;
1489}
1490
1296/* 1491/*
1297 * this function will check the on disk data for checksum errors, header 1492 * this function will check the on disk data for checksum errors, header
1298 * errors and read I/O errors. If any I/O errors happen, the exact pages 1493 * errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1303,7 +1498,7 @@ leave_nomem:
1303static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 1498static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1304 struct scrub_block *sblock, int is_metadata, 1499 struct scrub_block *sblock, int is_metadata,
1305 int have_csum, u8 *csum, u64 generation, 1500 int have_csum, u8 *csum, u64 generation,
1306 u16 csum_size) 1501 u16 csum_size, int retry_failed_mirror)
1307{ 1502{
1308 int page_num; 1503 int page_num;
1309 1504
@@ -1329,11 +1524,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1329 continue; 1524 continue;
1330 } 1525 }
1331 bio->bi_bdev = page->dev->bdev; 1526 bio->bi_bdev = page->dev->bdev;
1332 bio->bi_iter.bi_sector = page->physical >> 9;
1333 1527
1334 bio_add_page(bio, page->page, PAGE_SIZE, 0); 1528 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1335 if (btrfsic_submit_bio_wait(READ, bio)) 1529 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1336 sblock->no_io_error_seen = 0; 1530 if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
1531 sblock->no_io_error_seen = 0;
1532 } else {
1533 bio->bi_iter.bi_sector = page->physical >> 9;
1534
1535 if (btrfsic_submit_bio_wait(READ, bio))
1536 sblock->no_io_error_seen = 0;
1537 }
1337 1538
1338 bio_put(bio); 1539 bio_put(bio);
1339 } 1540 }
@@ -1486,6 +1687,13 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1486{ 1687{
1487 int page_num; 1688 int page_num;
1488 1689
1690 /*
 1691	 * This block is used to check the parity on the source device, so the
 1692	 * data doesn't need to be written to the destination device.
1693 */
1694 if (sblock->sparity)
1695 return;
1696
1489 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1697 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1490 int ret; 1698 int ret;
1491 1699
@@ -1867,6 +2075,9 @@ static void scrub_block_put(struct scrub_block *sblock)
1867 if (atomic_dec_and_test(&sblock->ref_count)) { 2075 if (atomic_dec_and_test(&sblock->ref_count)) {
1868 int i; 2076 int i;
1869 2077
2078 if (sblock->sparity)
2079 scrub_parity_put(sblock->sparity);
2080
1870 for (i = 0; i < sblock->page_count; i++) 2081 for (i = 0; i < sblock->page_count; i++)
1871 scrub_page_put(sblock->pagev[i]); 2082 scrub_page_put(sblock->pagev[i]);
1872 kfree(sblock); 2083 kfree(sblock);
@@ -2124,9 +2335,51 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
2124 scrub_pending_bio_dec(sctx); 2335 scrub_pending_bio_dec(sctx);
2125} 2336}
2126 2337
2338static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2339 unsigned long *bitmap,
2340 u64 start, u64 len)
2341{
2342 int offset;
2343 int nsectors;
2344 int sectorsize = sparity->sctx->dev_root->sectorsize;
2345
2346 if (len >= sparity->stripe_len) {
2347 bitmap_set(bitmap, 0, sparity->nsectors);
2348 return;
2349 }
2350
2351 start -= sparity->logic_start;
2352 offset = (int)do_div(start, sparity->stripe_len);
2353 offset /= sectorsize;
2354 nsectors = (int)len / sectorsize;
2355
2356 if (offset + nsectors <= sparity->nsectors) {
2357 bitmap_set(bitmap, offset, nsectors);
2358 return;
2359 }
2360
2361 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2362 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2363}
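__scrub_mark_bitmap() translates a byte range inside one stripe into per-sector bits and wraps around when the range crosses the end of the bitmap. A small user-space sketch of the same index arithmetic, assuming a 64 KiB stripe and 4 KiB sectors purely for illustration:

#include <stdio.h>

int main(void)
{
	unsigned long long stripe_len = 64 * 1024;	/* assumed stripe size */
	int sectorsize = 4096;				/* assumed sector size */
	int total = stripe_len / sectorsize;		/* bits in the bitmap: 16 */

	/* a range starting 56 KiB into the stripe, 16 KiB long -> wraps */
	unsigned long long start = 56 * 1024;
	unsigned long long len = 16 * 1024;

	int offset = (int)((start % stripe_len) / sectorsize);
	int nsectors = (int)(len / sectorsize);

	if (offset + nsectors <= total)
		printf("set bits [%d, %d)\n", offset, offset + nsectors);
	else	/* tail of the stripe, then wrap to the front */
		printf("set bits [%d, %d) and [0, %d)\n",
		       offset, total, nsectors - (total - offset));
	return 0;
}

With these numbers the helper would call bitmap_set() twice, once for sectors 14-15 and once for sectors 0-1.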
2364
2365static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2366 u64 start, u64 len)
2367{
2368 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2369}
2370
2371static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2372 u64 start, u64 len)
2373{
2374 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2375}
2376
2127static void scrub_block_complete(struct scrub_block *sblock) 2377static void scrub_block_complete(struct scrub_block *sblock)
2128{ 2378{
2379 int corrupted = 0;
2380
2129 if (!sblock->no_io_error_seen) { 2381 if (!sblock->no_io_error_seen) {
2382 corrupted = 1;
2130 scrub_handle_errored_block(sblock); 2383 scrub_handle_errored_block(sblock);
2131 } else { 2384 } else {
2132 /* 2385 /*
@@ -2134,9 +2387,19 @@ static void scrub_block_complete(struct scrub_block *sblock)
2134 * dev replace case, otherwise write here in dev replace 2387 * dev replace case, otherwise write here in dev replace
2135 * case. 2388 * case.
2136 */ 2389 */
2137 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) 2390 corrupted = scrub_checksum(sblock);
2391 if (!corrupted && sblock->sctx->is_dev_replace)
2138 scrub_write_block_to_dev_replace(sblock); 2392 scrub_write_block_to_dev_replace(sblock);
2139 } 2393 }
2394
2395 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2396 u64 start = sblock->pagev[0]->logical;
2397 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2398 PAGE_SIZE;
2399
2400 scrub_parity_mark_sectors_error(sblock->sparity,
2401 start, end - start);
2402 }
2140} 2403}
2141 2404
2142static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, 2405static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -2228,6 +2491,132 @@ behind_scrub_pages:
2228 return 0; 2491 return 0;
2229} 2492}
2230 2493
2494static int scrub_pages_for_parity(struct scrub_parity *sparity,
2495 u64 logical, u64 len,
2496 u64 physical, struct btrfs_device *dev,
2497 u64 flags, u64 gen, int mirror_num, u8 *csum)
2498{
2499 struct scrub_ctx *sctx = sparity->sctx;
2500 struct scrub_block *sblock;
2501 int index;
2502
2503 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2504 if (!sblock) {
2505 spin_lock(&sctx->stat_lock);
2506 sctx->stat.malloc_errors++;
2507 spin_unlock(&sctx->stat_lock);
2508 return -ENOMEM;
2509 }
2510
2511 /* one ref inside this function, plus one for each page added to
2512 * a bio later on */
2513 atomic_set(&sblock->ref_count, 1);
2514 sblock->sctx = sctx;
2515 sblock->no_io_error_seen = 1;
2516 sblock->sparity = sparity;
2517 scrub_parity_get(sparity);
2518
2519 for (index = 0; len > 0; index++) {
2520 struct scrub_page *spage;
2521 u64 l = min_t(u64, len, PAGE_SIZE);
2522
2523 spage = kzalloc(sizeof(*spage), GFP_NOFS);
2524 if (!spage) {
2525leave_nomem:
2526 spin_lock(&sctx->stat_lock);
2527 sctx->stat.malloc_errors++;
2528 spin_unlock(&sctx->stat_lock);
2529 scrub_block_put(sblock);
2530 return -ENOMEM;
2531 }
2532 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2533 /* For scrub block */
2534 scrub_page_get(spage);
2535 sblock->pagev[index] = spage;
2536 /* For scrub parity */
2537 scrub_page_get(spage);
2538 list_add_tail(&spage->list, &sparity->spages);
2539 spage->sblock = sblock;
2540 spage->dev = dev;
2541 spage->flags = flags;
2542 spage->generation = gen;
2543 spage->logical = logical;
2544 spage->physical = physical;
2545 spage->mirror_num = mirror_num;
2546 if (csum) {
2547 spage->have_csum = 1;
2548 memcpy(spage->csum, csum, sctx->csum_size);
2549 } else {
2550 spage->have_csum = 0;
2551 }
2552 sblock->page_count++;
2553 spage->page = alloc_page(GFP_NOFS);
2554 if (!spage->page)
2555 goto leave_nomem;
2556 len -= l;
2557 logical += l;
2558 physical += l;
2559 }
2560
2561 WARN_ON(sblock->page_count == 0);
2562 for (index = 0; index < sblock->page_count; index++) {
2563 struct scrub_page *spage = sblock->pagev[index];
2564 int ret;
2565
2566 ret = scrub_add_page_to_rd_bio(sctx, spage);
2567 if (ret) {
2568 scrub_block_put(sblock);
2569 return ret;
2570 }
2571 }
2572
2573 /* last one frees, either here or in bio completion for last page */
2574 scrub_block_put(sblock);
2575 return 0;
2576}
2577
2578static int scrub_extent_for_parity(struct scrub_parity *sparity,
2579 u64 logical, u64 len,
2580 u64 physical, struct btrfs_device *dev,
2581 u64 flags, u64 gen, int mirror_num)
2582{
2583 struct scrub_ctx *sctx = sparity->sctx;
2584 int ret;
2585 u8 csum[BTRFS_CSUM_SIZE];
2586 u32 blocksize;
2587
2588 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2589 blocksize = sctx->sectorsize;
2590 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2591 blocksize = sctx->nodesize;
2592 } else {
2593 blocksize = sctx->sectorsize;
2594 WARN_ON(1);
2595 }
2596
2597 while (len) {
2598 u64 l = min_t(u64, len, blocksize);
2599 int have_csum = 0;
2600
2601 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2602 /* push csums to sbio */
2603 have_csum = scrub_find_csum(sctx, logical, l, csum);
2604 if (have_csum == 0)
2605 goto skip;
2606 }
2607 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2608 flags, gen, mirror_num,
2609 have_csum ? csum : NULL);
2610skip:
2611 if (ret)
2612 return ret;
2613 len -= l;
2614 logical += l;
2615 physical += l;
2616 }
2617 return 0;
2618}
2619
2231/* 2620/*
2232 * Given a physical address, this will calculate it's 2621 * Given a physical address, this will calculate it's
2233 * logical offset. if this is a parity stripe, it will return 2622 * logical offset. if this is a parity stripe, it will return
@@ -2236,7 +2625,8 @@ behind_scrub_pages:
2236 * return 0 if it is a data stripe, 1 means parity stripe. 2625 * return 0 if it is a data stripe, 1 means parity stripe.
2237 */ 2626 */
2238static int get_raid56_logic_offset(u64 physical, int num, 2627static int get_raid56_logic_offset(u64 physical, int num,
2239 struct map_lookup *map, u64 *offset) 2628 struct map_lookup *map, u64 *offset,
2629 u64 *stripe_start)
2240{ 2630{
2241 int i; 2631 int i;
2242 int j = 0; 2632 int j = 0;
@@ -2247,6 +2637,9 @@ static int get_raid56_logic_offset(u64 physical, int num,
2247 2637
2248 last_offset = (physical - map->stripes[num].physical) * 2638 last_offset = (physical - map->stripes[num].physical) *
2249 nr_data_stripes(map); 2639 nr_data_stripes(map);
2640 if (stripe_start)
2641 *stripe_start = last_offset;
2642
2250 *offset = last_offset; 2643 *offset = last_offset;
2251 for (i = 0; i < nr_data_stripes(map); i++) { 2644 for (i = 0; i < nr_data_stripes(map); i++) {
2252 *offset = last_offset + i * map->stripe_len; 2645 *offset = last_offset + i * map->stripe_len;
@@ -2269,13 +2662,330 @@ static int get_raid56_logic_offset(u64 physical, int num,
2269 return 1; 2662 return 1;
2270} 2663}
2271 2664
2665static void scrub_free_parity(struct scrub_parity *sparity)
2666{
2667 struct scrub_ctx *sctx = sparity->sctx;
2668 struct scrub_page *curr, *next;
2669 int nbits;
2670
2671 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2672 if (nbits) {
2673 spin_lock(&sctx->stat_lock);
2674 sctx->stat.read_errors += nbits;
2675 sctx->stat.uncorrectable_errors += nbits;
2676 spin_unlock(&sctx->stat_lock);
2677 }
2678
2679 list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2680 list_del_init(&curr->list);
2681 scrub_page_put(curr);
2682 }
2683
2684 kfree(sparity);
2685}
2686
2687static void scrub_parity_bio_endio(struct bio *bio, int error)
2688{
2689 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2690 struct scrub_ctx *sctx = sparity->sctx;
2691
2692 if (error)
2693 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2694 sparity->nsectors);
2695
2696 scrub_free_parity(sparity);
2697 scrub_pending_bio_dec(sctx);
2698 bio_put(bio);
2699}
2700
2701static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2702{
2703 struct scrub_ctx *sctx = sparity->sctx;
2704 struct bio *bio;
2705 struct btrfs_raid_bio *rbio;
2706 struct scrub_page *spage;
2707 struct btrfs_bio *bbio = NULL;
2708 u64 *raid_map = NULL;
2709 u64 length;
2710 int ret;
2711
2712 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2713 sparity->nsectors))
2714 goto out;
2715
2716 length = sparity->logic_end - sparity->logic_start + 1;
2717 ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
2718 sparity->logic_start,
2719 &length, &bbio, 0, &raid_map);
2720 if (ret || !bbio || !raid_map)
2721 goto bbio_out;
2722
2723 bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2724 if (!bio)
2725 goto bbio_out;
2726
2727 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2728 bio->bi_private = sparity;
2729 bio->bi_end_io = scrub_parity_bio_endio;
2730
2731 rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
2732 raid_map, length,
2733 sparity->scrub_dev,
2734 sparity->dbitmap,
2735 sparity->nsectors);
2736 if (!rbio)
2737 goto rbio_out;
2738
2739 list_for_each_entry(spage, &sparity->spages, list)
2740 raid56_parity_add_scrub_pages(rbio, spage->page,
2741 spage->logical);
2742
2743 scrub_pending_bio_inc(sctx);
2744 raid56_parity_submit_scrub_rbio(rbio);
2745 return;
2746
2747rbio_out:
2748 bio_put(bio);
2749bbio_out:
2750 kfree(bbio);
2751 kfree(raid_map);
2752 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2753 sparity->nsectors);
2754 spin_lock(&sctx->stat_lock);
2755 sctx->stat.malloc_errors++;
2756 spin_unlock(&sctx->stat_lock);
2757out:
2758 scrub_free_parity(sparity);
2759}
2760
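scrub_parity_check_and_repair() starts by subtracting the sectors that already failed to read (ebitmap) from the sectors whose parity still needs verification (dbitmap), and skips the parity check entirely when nothing remains. With single-word bitmaps the same filtering looks like this (user-space sketch, masks chosen only for illustration):

#include <stdio.h>

int main(void)
{
	unsigned long dbitmap = 0x0f;	/* sectors 0-3 carry data to verify */
	unsigned long ebitmap = 0x0f;	/* the same sectors already failed to read */

	dbitmap &= ~ebitmap;		/* what bitmap_andnot() computes */
	if (!dbitmap) {
		printf("nothing left to verify, skip the parity rbio\n");
		return 0;
	}
	printf("verify parity for sector mask 0x%lx\n", dbitmap);
	return 0;
}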
2761static inline int scrub_calc_parity_bitmap_len(int nsectors)
2762{
2763 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
2764}
2765
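scrub_calc_parity_bitmap_len() rounds the per-stripe sector count up to whole unsigned longs and converts that to bytes; scrub_raid56_parity() below allocates 2 * bitmap_len extra bytes after struct scrub_parity and carves dbitmap and ebitmap out of them back to back. A quick user-space check of the arithmetic, assuming 64-bit longs and a 16-sector stripe:

#include <stdio.h>

#define MY_BITS_PER_LONG (8 * (int)sizeof(long))
#define MY_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int nsectors = 16;	/* e.g. 64 KiB stripe / 4 KiB sectors */
	int bitmap_len = MY_DIV_ROUND_UP(nsectors, MY_BITS_PER_LONG) *
			 (MY_BITS_PER_LONG / 8);

	/* 16 sectors fit in one 64-bit word: 8 bytes per bitmap,
	 * 16 bytes total appended to the struct scrub_parity allocation */
	printf("bitmap_len = %d bytes, dbitmap+ebitmap = %d bytes\n",
	       bitmap_len, 2 * bitmap_len);
	return 0;
}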
2766static void scrub_parity_get(struct scrub_parity *sparity)
2767{
2768 atomic_inc(&sparity->ref_count);
2769}
2770
2771static void scrub_parity_put(struct scrub_parity *sparity)
2772{
2773 if (!atomic_dec_and_test(&sparity->ref_count))
2774 return;
2775
2776 scrub_parity_check_and_repair(sparity);
2777}
2778
2779static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2780 struct map_lookup *map,
2781 struct btrfs_device *sdev,
2782 struct btrfs_path *path,
2783 u64 logic_start,
2784 u64 logic_end)
2785{
2786 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2787 struct btrfs_root *root = fs_info->extent_root;
2788 struct btrfs_root *csum_root = fs_info->csum_root;
2789 struct btrfs_extent_item *extent;
2790 u64 flags;
2791 int ret;
2792 int slot;
2793 struct extent_buffer *l;
2794 struct btrfs_key key;
2795 u64 generation;
2796 u64 extent_logical;
2797 u64 extent_physical;
2798 u64 extent_len;
2799 struct btrfs_device *extent_dev;
2800 struct scrub_parity *sparity;
2801 int nsectors;
2802 int bitmap_len;
2803 int extent_mirror_num;
2804 int stop_loop = 0;
2805
2806 nsectors = map->stripe_len / root->sectorsize;
2807 bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2808 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2809 GFP_NOFS);
2810 if (!sparity) {
2811 spin_lock(&sctx->stat_lock);
2812 sctx->stat.malloc_errors++;
2813 spin_unlock(&sctx->stat_lock);
2814 return -ENOMEM;
2815 }
2816
2817 sparity->stripe_len = map->stripe_len;
2818 sparity->nsectors = nsectors;
2819 sparity->sctx = sctx;
2820 sparity->scrub_dev = sdev;
2821 sparity->logic_start = logic_start;
2822 sparity->logic_end = logic_end;
2823 atomic_set(&sparity->ref_count, 1);
2824 INIT_LIST_HEAD(&sparity->spages);
2825 sparity->dbitmap = sparity->bitmap;
2826 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2827
2828 ret = 0;
2829 while (logic_start < logic_end) {
2830 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2831 key.type = BTRFS_METADATA_ITEM_KEY;
2832 else
2833 key.type = BTRFS_EXTENT_ITEM_KEY;
2834 key.objectid = logic_start;
2835 key.offset = (u64)-1;
2836
2837 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2838 if (ret < 0)
2839 goto out;
2840
2841 if (ret > 0) {
2842 ret = btrfs_previous_extent_item(root, path, 0);
2843 if (ret < 0)
2844 goto out;
2845 if (ret > 0) {
2846 btrfs_release_path(path);
2847 ret = btrfs_search_slot(NULL, root, &key,
2848 path, 0, 0);
2849 if (ret < 0)
2850 goto out;
2851 }
2852 }
2853
2854 stop_loop = 0;
2855 while (1) {
2856 u64 bytes;
2857
2858 l = path->nodes[0];
2859 slot = path->slots[0];
2860 if (slot >= btrfs_header_nritems(l)) {
2861 ret = btrfs_next_leaf(root, path);
2862 if (ret == 0)
2863 continue;
2864 if (ret < 0)
2865 goto out;
2866
2867 stop_loop = 1;
2868 break;
2869 }
2870 btrfs_item_key_to_cpu(l, &key, slot);
2871
2872 if (key.type == BTRFS_METADATA_ITEM_KEY)
2873 bytes = root->nodesize;
2874 else
2875 bytes = key.offset;
2876
2877 if (key.objectid + bytes <= logic_start)
2878 goto next;
2879
2880 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2881 key.type != BTRFS_METADATA_ITEM_KEY)
2882 goto next;
2883
2884 if (key.objectid > logic_end) {
2885 stop_loop = 1;
2886 break;
2887 }
2888
2889 while (key.objectid >= logic_start + map->stripe_len)
2890 logic_start += map->stripe_len;
2891
2892 extent = btrfs_item_ptr(l, slot,
2893 struct btrfs_extent_item);
2894 flags = btrfs_extent_flags(l, extent);
2895 generation = btrfs_extent_generation(l, extent);
2896
2897 if (key.objectid < logic_start &&
2898 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2899 btrfs_err(fs_info,
2900 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2901 key.objectid, logic_start);
2902 goto next;
2903 }
2904again:
2905 extent_logical = key.objectid;
2906 extent_len = bytes;
2907
2908 if (extent_logical < logic_start) {
2909 extent_len -= logic_start - extent_logical;
2910 extent_logical = logic_start;
2911 }
2912
2913 if (extent_logical + extent_len >
2914 logic_start + map->stripe_len)
2915 extent_len = logic_start + map->stripe_len -
2916 extent_logical;
2917
2918 scrub_parity_mark_sectors_data(sparity, extent_logical,
2919 extent_len);
2920
2921 scrub_remap_extent(fs_info, extent_logical,
2922 extent_len, &extent_physical,
2923 &extent_dev,
2924 &extent_mirror_num);
2925
2926 ret = btrfs_lookup_csums_range(csum_root,
2927 extent_logical,
2928 extent_logical + extent_len - 1,
2929 &sctx->csum_list, 1);
2930 if (ret)
2931 goto out;
2932
2933 ret = scrub_extent_for_parity(sparity, extent_logical,
2934 extent_len,
2935 extent_physical,
2936 extent_dev, flags,
2937 generation,
2938 extent_mirror_num);
2939 if (ret)
2940 goto out;
2941
2942 scrub_free_csums(sctx);
2943 if (extent_logical + extent_len <
2944 key.objectid + bytes) {
2945 logic_start += map->stripe_len;
2946
2947 if (logic_start >= logic_end) {
2948 stop_loop = 1;
2949 break;
2950 }
2951
2952 if (logic_start < key.objectid + bytes) {
2953 cond_resched();
2954 goto again;
2955 }
2956 }
2957next:
2958 path->slots[0]++;
2959 }
2960
2961 btrfs_release_path(path);
2962
2963 if (stop_loop)
2964 break;
2965
2966 logic_start += map->stripe_len;
2967 }
2968out:
2969 if (ret < 0)
2970 scrub_parity_mark_sectors_error(sparity, logic_start,
2971 logic_end - logic_start + 1);
2972 scrub_parity_put(sparity);
2973 scrub_submit(sctx);
2974 mutex_lock(&sctx->wr_ctx.wr_lock);
2975 scrub_wr_submit(sctx);
2976 mutex_unlock(&sctx->wr_ctx.wr_lock);
2977
2978 btrfs_release_path(path);
2979 return ret < 0 ? ret : 0;
2980}
2981
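Inside scrub_raid56_parity() above, every extent item found in the extent tree is clipped to the stripe window currently being scrubbed, [logic_start, logic_start + stripe_len), before its sectors are recorded in dbitmap; an extent that continues past the window is revisited under the again: label once logic_start has advanced. The clipping itself is plain interval math (user-space sketch, numbers invented):

#include <stdio.h>

int main(void)
{
	unsigned long long stripe_len = 64 * 1024;
	unsigned long long logic_start = 128 * 1024;	/* current stripe window */
	unsigned long long extent_logical = 120 * 1024;	/* extent starts before it */
	unsigned long long extent_len = 96 * 1024;	/* and runs past its end */

	if (extent_logical < logic_start) {		/* clip the front */
		extent_len -= logic_start - extent_logical;
		extent_logical = logic_start;
	}
	if (extent_logical + extent_len > logic_start + stripe_len)
		extent_len = logic_start + stripe_len - extent_logical;	/* clip the tail */

	printf("scrub [%llu, %llu) within this stripe\n",
	       extent_logical, extent_logical + extent_len);
	return 0;
}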
2272static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 2982static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2273 struct map_lookup *map, 2983 struct map_lookup *map,
2274 struct btrfs_device *scrub_dev, 2984 struct btrfs_device *scrub_dev,
2275 int num, u64 base, u64 length, 2985 int num, u64 base, u64 length,
2276 int is_dev_replace) 2986 int is_dev_replace)
2277{ 2987{
2278 struct btrfs_path *path; 2988 struct btrfs_path *path, *ppath;
2279 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 2989 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2280 struct btrfs_root *root = fs_info->extent_root; 2990 struct btrfs_root *root = fs_info->extent_root;
2281 struct btrfs_root *csum_root = fs_info->csum_root; 2991 struct btrfs_root *csum_root = fs_info->csum_root;
@@ -2302,6 +3012,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2302 u64 extent_logical; 3012 u64 extent_logical;
2303 u64 extent_physical; 3013 u64 extent_physical;
2304 u64 extent_len; 3014 u64 extent_len;
3015 u64 stripe_logical;
3016 u64 stripe_end;
2305 struct btrfs_device *extent_dev; 3017 struct btrfs_device *extent_dev;
2306 int extent_mirror_num; 3018 int extent_mirror_num;
2307 int stop_loop = 0; 3019 int stop_loop = 0;
@@ -2327,7 +3039,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2327 mirror_num = num % map->num_stripes + 1; 3039 mirror_num = num % map->num_stripes + 1;
2328 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3040 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2329 BTRFS_BLOCK_GROUP_RAID6)) { 3041 BTRFS_BLOCK_GROUP_RAID6)) {
2330 get_raid56_logic_offset(physical, num, map, &offset); 3042 get_raid56_logic_offset(physical, num, map, &offset, NULL);
2331 increment = map->stripe_len * nr_data_stripes(map); 3043 increment = map->stripe_len * nr_data_stripes(map);
2332 mirror_num = 1; 3044 mirror_num = 1;
2333 } else { 3045 } else {
@@ -2339,6 +3051,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2339 if (!path) 3051 if (!path)
2340 return -ENOMEM; 3052 return -ENOMEM;
2341 3053
3054 ppath = btrfs_alloc_path();
3055 if (!ppath) {
 3056 btrfs_free_path(path);
3057 return -ENOMEM;
3058 }
3059
2342 /* 3060 /*
2343 * work on commit root. The related disk blocks are static as 3061 * work on commit root. The related disk blocks are static as
2344 * long as COW is applied. This means, it is save to rewrite 3062 * long as COW is applied. This means, it is save to rewrite
@@ -2357,7 +3075,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2357 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3075 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2358 BTRFS_BLOCK_GROUP_RAID6)) { 3076 BTRFS_BLOCK_GROUP_RAID6)) {
2359 get_raid56_logic_offset(physical_end, num, 3077 get_raid56_logic_offset(physical_end, num,
2360 map, &logic_end); 3078 map, &logic_end, NULL);
2361 logic_end += base; 3079 logic_end += base;
2362 } else { 3080 } else {
2363 logic_end = logical + increment * nstripes; 3081 logic_end = logical + increment * nstripes;
@@ -2404,10 +3122,18 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2404 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3122 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2405 BTRFS_BLOCK_GROUP_RAID6)) { 3123 BTRFS_BLOCK_GROUP_RAID6)) {
2406 ret = get_raid56_logic_offset(physical, num, 3124 ret = get_raid56_logic_offset(physical, num,
2407 map, &logical); 3125 map, &logical, &stripe_logical);
2408 logical += base; 3126 logical += base;
2409 if (ret) 3127 if (ret) {
3128 stripe_logical += base;
3129 stripe_end = stripe_logical + increment - 1;
3130 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3131 ppath, stripe_logical,
3132 stripe_end);
3133 if (ret)
3134 goto out;
2410 goto skip; 3135 goto skip;
3136 }
2411 } 3137 }
2412 /* 3138 /*
2413 * canceled? 3139 * canceled?
@@ -2558,13 +3284,25 @@ again:
2558 * loop until we find next data stripe 3284 * loop until we find next data stripe
2559 * or we have finished all stripes. 3285 * or we have finished all stripes.
2560 */ 3286 */
2561 do { 3287loop:
2562 physical += map->stripe_len; 3288 physical += map->stripe_len;
2563 ret = get_raid56_logic_offset( 3289 ret = get_raid56_logic_offset(physical,
2564 physical, num, 3290 num, map, &logical,
2565 map, &logical); 3291 &stripe_logical);
2566 logical += base; 3292 logical += base;
2567 } while (physical < physical_end && ret); 3293
3294 if (ret && physical < physical_end) {
3295 stripe_logical += base;
3296 stripe_end = stripe_logical +
3297 increment - 1;
3298 ret = scrub_raid56_parity(sctx,
3299 map, scrub_dev, ppath,
3300 stripe_logical,
3301 stripe_end);
3302 if (ret)
3303 goto out;
3304 goto loop;
3305 }
2568 } else { 3306 } else {
2569 physical += map->stripe_len; 3307 physical += map->stripe_len;
2570 logical += increment; 3308 logical += increment;
@@ -2605,6 +3343,7 @@ out:
2605 3343
2606 blk_finish_plug(&plug); 3344 blk_finish_plug(&plug);
2607 btrfs_free_path(path); 3345 btrfs_free_path(path);
3346 btrfs_free_path(ppath);
2608 return ret < 0 ? ret : 0; 3347 return ret < 0 ? ret : 0;
2609} 3348}
2610 3349
@@ -3310,6 +4049,50 @@ out:
3310 scrub_pending_trans_workers_dec(sctx); 4049 scrub_pending_trans_workers_dec(sctx);
3311} 4050}
3312 4051
4052static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
4053 u64 logical)
4054{
4055 struct extent_state *cached_state = NULL;
4056 struct btrfs_ordered_extent *ordered;
4057 struct extent_io_tree *io_tree;
4058 struct extent_map *em;
4059 u64 lockstart = start, lockend = start + len - 1;
4060 int ret = 0;
4061
4062 io_tree = &BTRFS_I(inode)->io_tree;
4063
4064 lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
4065 ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
4066 if (ordered) {
4067 btrfs_put_ordered_extent(ordered);
4068 ret = 1;
4069 goto out_unlock;
4070 }
4071
4072 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4073 if (IS_ERR(em)) {
4074 ret = PTR_ERR(em);
4075 goto out_unlock;
4076 }
4077
4078 /*
4079 * This extent does not actually cover the logical extent anymore,
4080 * move on to the next inode.
4081 */
4082 if (em->block_start > logical ||
4083 em->block_start + em->block_len < logical + len) {
4084 free_extent_map(em);
4085 ret = 1;
4086 goto out_unlock;
4087 }
4088 free_extent_map(em);
4089
4090out_unlock:
4091 unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
4092 GFP_NOFS);
4093 return ret;
4094}
4095
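check_extent_to_block() follows a three-way return convention: a negative errno on failure, a positive value when the range is still covered by an ordered extent or no longer maps to the expected logical address (the caller should simply skip it), and 0 when copying may proceed. Both call sites in copy_nocow_pages_for_inode() below fold the positive case back into success, roughly (caller-side sketch only, mirroring the call sites):

	ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
	if (ret) {
		ret = ret > 0 ? 0 : ret;	/* "skip" is not an error */
		goto out;			/* or next_page at the in-loop call site */
	}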
3313static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, 4096static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3314 struct scrub_copy_nocow_ctx *nocow_ctx) 4097 struct scrub_copy_nocow_ctx *nocow_ctx)
3315{ 4098{
@@ -3318,13 +4101,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3318 struct inode *inode; 4101 struct inode *inode;
3319 struct page *page; 4102 struct page *page;
3320 struct btrfs_root *local_root; 4103 struct btrfs_root *local_root;
3321 struct btrfs_ordered_extent *ordered;
3322 struct extent_map *em;
3323 struct extent_state *cached_state = NULL;
3324 struct extent_io_tree *io_tree; 4104 struct extent_io_tree *io_tree;
3325 u64 physical_for_dev_replace; 4105 u64 physical_for_dev_replace;
4106 u64 nocow_ctx_logical;
3326 u64 len = nocow_ctx->len; 4107 u64 len = nocow_ctx->len;
3327 u64 lockstart = offset, lockend = offset + len - 1;
3328 unsigned long index; 4108 unsigned long index;
3329 int srcu_index; 4109 int srcu_index;
3330 int ret = 0; 4110 int ret = 0;
@@ -3356,30 +4136,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3356 4136
3357 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 4137 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3358 io_tree = &BTRFS_I(inode)->io_tree; 4138 io_tree = &BTRFS_I(inode)->io_tree;
4139 nocow_ctx_logical = nocow_ctx->logical;
3359 4140
3360 lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); 4141 ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
3361 ordered = btrfs_lookup_ordered_range(inode, lockstart, len); 4142 if (ret) {
3362 if (ordered) { 4143 ret = ret > 0 ? 0 : ret;
3363 btrfs_put_ordered_extent(ordered); 4144 goto out;
3364 goto out_unlock;
3365 }
3366
3367 em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
3368 if (IS_ERR(em)) {
3369 ret = PTR_ERR(em);
3370 goto out_unlock;
3371 }
3372
3373 /*
3374 * This extent does not actually cover the logical extent anymore,
3375 * move on to the next inode.
3376 */
3377 if (em->block_start > nocow_ctx->logical ||
3378 em->block_start + em->block_len < nocow_ctx->logical + len) {
3379 free_extent_map(em);
3380 goto out_unlock;
3381 } 4145 }
3382 free_extent_map(em);
3383 4146
3384 while (len >= PAGE_CACHE_SIZE) { 4147 while (len >= PAGE_CACHE_SIZE) {
3385 index = offset >> PAGE_CACHE_SHIFT; 4148 index = offset >> PAGE_CACHE_SHIFT;
@@ -3396,7 +4159,7 @@ again:
3396 goto next_page; 4159 goto next_page;
3397 } else { 4160 } else {
3398 ClearPageError(page); 4161 ClearPageError(page);
3399 err = extent_read_full_page_nolock(io_tree, page, 4162 err = extent_read_full_page(io_tree, page,
3400 btrfs_get_extent, 4163 btrfs_get_extent,
3401 nocow_ctx->mirror_num); 4164 nocow_ctx->mirror_num);
3402 if (err) { 4165 if (err) {
@@ -3421,6 +4184,14 @@ again:
3421 goto next_page; 4184 goto next_page;
3422 } 4185 }
3423 } 4186 }
4187
4188 ret = check_extent_to_block(inode, offset, len,
4189 nocow_ctx_logical);
4190 if (ret) {
4191 ret = ret > 0 ? 0 : ret;
4192 goto next_page;
4193 }
4194
3424 err = write_page_nocow(nocow_ctx->sctx, 4195 err = write_page_nocow(nocow_ctx->sctx,
3425 physical_for_dev_replace, page); 4196 physical_for_dev_replace, page);
3426 if (err) 4197 if (err)
@@ -3434,12 +4205,10 @@ next_page:
3434 4205
3435 offset += PAGE_CACHE_SIZE; 4206 offset += PAGE_CACHE_SIZE;
3436 physical_for_dev_replace += PAGE_CACHE_SIZE; 4207 physical_for_dev_replace += PAGE_CACHE_SIZE;
4208 nocow_ctx_logical += PAGE_CACHE_SIZE;
3437 len -= PAGE_CACHE_SIZE; 4209 len -= PAGE_CACHE_SIZE;
3438 } 4210 }
3439 ret = COPY_COMPLETE; 4211 ret = COPY_COMPLETE;
3440out_unlock:
3441 unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
3442 GFP_NOFS);
3443out: 4212out:
3444 mutex_unlock(&inode->i_mutex); 4213 mutex_unlock(&inode->i_mutex);
3445 iput(inode); 4214 iput(inode);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 874828dd0a86..804432dbc351 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5507,6 +5507,51 @@ out:
5507 return ret; 5507 return ret;
5508} 5508}
5509 5509
5510/*
5511 * If orphan cleanup did remove any orphans from a root, it means the tree
5512 * was modified and therefore the commit root is not the same as the current
5513 * root anymore. This is a problem, because send uses the commit root and
5514 * therefore can see inode items that don't exist in the current root anymore,
5515 * and for example make calls to btrfs_iget, which will do tree lookups based
5516 * on the current root and not on the commit root. Those lookups will fail,
5517 * returning a -ESTALE error, and making send fail with that error. So make
5518 * sure a send does not see any orphans we have just removed, and that it will
5519 * see the same inodes regardless of whether a transaction commit happened
5520 * before it started (meaning that the commit root will be the same as the
5521 * current root) or not.
5522 */
5523static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
5524{
5525 int i;
5526 struct btrfs_trans_handle *trans = NULL;
5527
5528again:
5529 if (sctx->parent_root &&
5530 sctx->parent_root->node != sctx->parent_root->commit_root)
5531 goto commit_trans;
5532
5533 for (i = 0; i < sctx->clone_roots_cnt; i++)
5534 if (sctx->clone_roots[i].root->node !=
5535 sctx->clone_roots[i].root->commit_root)
5536 goto commit_trans;
5537
5538 if (trans)
5539 return btrfs_end_transaction(trans, sctx->send_root);
5540
5541 return 0;
5542
5543commit_trans:
5544 /* Use any root, all fs roots will get their commit roots updated. */
5545 if (!trans) {
5546 trans = btrfs_join_transaction(sctx->send_root);
5547 if (IS_ERR(trans))
5548 return PTR_ERR(trans);
5549 goto again;
5550 }
5551
5552 return btrfs_commit_transaction(trans, sctx->send_root);
5553}
5554
5510static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) 5555static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
5511{ 5556{
5512 spin_lock(&root->root_item_lock); 5557 spin_lock(&root->root_item_lock);
@@ -5728,6 +5773,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5728 NULL); 5773 NULL);
5729 sort_clone_roots = 1; 5774 sort_clone_roots = 1;
5730 5775
5776 ret = ensure_commit_roots_uptodate(sctx);
5777 if (ret)
5778 goto out;
5779
5731 current->journal_info = BTRFS_SEND_TRANS_STUB; 5780 current->journal_info = BTRFS_SEND_TRANS_STUB;
5732 ret = send_subvol(sctx); 5781 ret = send_subvol(sctx);
5733 current->journal_info = NULL; 5782 current->journal_info = NULL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 54bd91ece35b..60f7cbe815e9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -262,7 +262,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
262 trans->aborted = errno; 262 trans->aborted = errno;
263 /* Nothing used. The other threads that have joined this 263 /* Nothing used. The other threads that have joined this
264 * transaction may be able to continue. */ 264 * transaction may be able to continue. */
265 if (!trans->blocks_used) { 265 if (!trans->blocks_used && list_empty(&trans->new_bgs)) {
266 const char *errstr; 266 const char *errstr;
267 267
268 errstr = btrfs_decode_error(errno); 268 errstr = btrfs_decode_error(errno);
@@ -642,11 +642,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
642 "disabling disk space caching"); 642 "disabling disk space caching");
643 break; 643 break;
644 case Opt_inode_cache: 644 case Opt_inode_cache:
645 btrfs_set_and_info(root, CHANGE_INODE_CACHE, 645 btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
646 "enabling inode map caching"); 646 "enabling inode map caching");
647 break; 647 break;
648 case Opt_noinode_cache: 648 case Opt_noinode_cache:
649 btrfs_clear_and_info(root, CHANGE_INODE_CACHE, 649 btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
650 "disabling inode map caching"); 650 "disabling inode map caching");
651 break; 651 break;
652 case Opt_clear_cache: 652 case Opt_clear_cache:
@@ -993,9 +993,17 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
993 trans = btrfs_attach_transaction_barrier(root); 993 trans = btrfs_attach_transaction_barrier(root);
994 if (IS_ERR(trans)) { 994 if (IS_ERR(trans)) {
995 /* no transaction, don't bother */ 995 /* no transaction, don't bother */
996 if (PTR_ERR(trans) == -ENOENT) 996 if (PTR_ERR(trans) == -ENOENT) {
997 return 0; 997 /*
998 return PTR_ERR(trans); 998 * Exit unless we have some pending changes
999 * that need to go through commit
1000 */
1001 if (fs_info->pending_changes == 0)
1002 return 0;
1003 trans = btrfs_start_transaction(root, 0);
1004 } else {
1005 return PTR_ERR(trans);
1006 }
999 } 1007 }
1000 return btrfs_commit_transaction(trans, root); 1008 return btrfs_commit_transaction(trans, root);
1001} 1009}
@@ -1644,8 +1652,20 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1644 int i = 0, nr_devices; 1652 int i = 0, nr_devices;
1645 int ret; 1653 int ret;
1646 1654
1655 /*
1656 * We aren't under the device list lock, so this is racey-ish, but good
1657 * enough for our purposes.
1658 */
1647 nr_devices = fs_info->fs_devices->open_devices; 1659 nr_devices = fs_info->fs_devices->open_devices;
1648 BUG_ON(!nr_devices); 1660 if (!nr_devices) {
1661 smp_mb();
1662 nr_devices = fs_info->fs_devices->open_devices;
1663 ASSERT(nr_devices);
1664 if (!nr_devices) {
1665 *free_bytes = 0;
1666 return 0;
1667 }
1668 }
1649 1669
1650 devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), 1670 devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
1651 GFP_NOFS); 1671 GFP_NOFS);
@@ -1670,11 +1690,17 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1670 else 1690 else
1671 min_stripe_size = BTRFS_STRIPE_LEN; 1691 min_stripe_size = BTRFS_STRIPE_LEN;
1672 1692
1673 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1693 if (fs_info->alloc_start)
1694 mutex_lock(&fs_devices->device_list_mutex);
1695 rcu_read_lock();
1696 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
1674 if (!device->in_fs_metadata || !device->bdev || 1697 if (!device->in_fs_metadata || !device->bdev ||
1675 device->is_tgtdev_for_dev_replace) 1698 device->is_tgtdev_for_dev_replace)
1676 continue; 1699 continue;
1677 1700
1701 if (i >= nr_devices)
1702 break;
1703
1678 avail_space = device->total_bytes - device->bytes_used; 1704 avail_space = device->total_bytes - device->bytes_used;
1679 1705
1680 /* align with stripe_len */ 1706 /* align with stripe_len */
@@ -1689,24 +1715,32 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1689 skip_space = 1024 * 1024; 1715 skip_space = 1024 * 1024;
1690 1716
1691 /* user can set the offset in fs_info->alloc_start. */ 1717 /* user can set the offset in fs_info->alloc_start. */
1692 if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= 1718 if (fs_info->alloc_start &&
1693 device->total_bytes) 1719 fs_info->alloc_start + BTRFS_STRIPE_LEN <=
1720 device->total_bytes) {
1721 rcu_read_unlock();
1694 skip_space = max(fs_info->alloc_start, skip_space); 1722 skip_space = max(fs_info->alloc_start, skip_space);
1695 1723
1696 /* 1724 /*
1697 * btrfs can not use the free space in [0, skip_space - 1], 1725 * btrfs can not use the free space in
1698 * we must subtract it from the total. In order to implement 1726 * [0, skip_space - 1], we must subtract it from the
1699 * it, we account the used space in this range first. 1727 * total. In order to implement it, we account the used
1700 */ 1728 * space in this range first.
1701 ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, 1729 */
1702 &used_space); 1730 ret = btrfs_account_dev_extents_size(device, 0,
1703 if (ret) { 1731 skip_space - 1,
1704 kfree(devices_info); 1732 &used_space);
1705 return ret; 1733 if (ret) {
1706 } 1734 kfree(devices_info);
1735 mutex_unlock(&fs_devices->device_list_mutex);
1736 return ret;
1737 }
1707 1738
1708 /* calc the free space in [0, skip_space - 1] */ 1739 rcu_read_lock();
1709 skip_space -= used_space; 1740
1741 /* calc the free space in [0, skip_space - 1] */
1742 skip_space -= used_space;
1743 }
1710 1744
1711 /* 1745 /*
1712 * we can use the free space in [0, skip_space - 1], subtract 1746 * we can use the free space in [0, skip_space - 1], subtract
@@ -1725,6 +1759,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1725 1759
1726 i++; 1760 i++;
1727 } 1761 }
1762 rcu_read_unlock();
1763 if (fs_info->alloc_start)
1764 mutex_unlock(&fs_devices->device_list_mutex);
1728 1765
1729 nr_devices = i; 1766 nr_devices = i;
1730 1767
@@ -1787,8 +1824,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1787 * holding chunk_muext to avoid allocating new chunks, holding 1824 * holding chunk_muext to avoid allocating new chunks, holding
1788 * device_list_mutex to avoid the device being removed 1825 * device_list_mutex to avoid the device being removed
1789 */ 1826 */
1790 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1791 mutex_lock(&fs_info->chunk_mutex);
1792 rcu_read_lock(); 1827 rcu_read_lock();
1793 list_for_each_entry_rcu(found, head, list) { 1828 list_for_each_entry_rcu(found, head, list) {
1794 if (found->flags & BTRFS_BLOCK_GROUP_DATA) { 1829 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1824,17 +1859,12 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1824 buf->f_bfree -= block_rsv->size >> bits; 1859 buf->f_bfree -= block_rsv->size >> bits;
1825 spin_unlock(&block_rsv->lock); 1860 spin_unlock(&block_rsv->lock);
1826 1861
1827 buf->f_bavail = total_free_data; 1862 buf->f_bavail = div_u64(total_free_data, factor);
1828 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); 1863 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1829 if (ret) { 1864 if (ret)
1830 mutex_unlock(&fs_info->chunk_mutex);
1831 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1832 return ret; 1865 return ret;
1833 }
1834 buf->f_bavail += div_u64(total_free_data, factor); 1866 buf->f_bavail += div_u64(total_free_data, factor);
1835 buf->f_bavail = buf->f_bavail >> bits; 1867 buf->f_bavail = buf->f_bavail >> bits;
1836 mutex_unlock(&fs_info->chunk_mutex);
1837 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1838 1868
1839 buf->f_type = BTRFS_SUPER_MAGIC; 1869 buf->f_type = BTRFS_SUPER_MAGIC;
1840 buf->f_bsize = dentry->d_sb->s_blocksize; 1870 buf->f_bsize = dentry->d_sb->s_blocksize;
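The statfs change above divides the raw free space of data block groups by the block group's replication factor before adding it to f_bavail, so a profile that stores multiple copies (RAID1, DUP, RAID10) no longer reports roughly twice the space user data can actually consume. With invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long total_free_data = 200ULL << 30;	/* 200 GiB raw free space */
	unsigned long long factor = 2;				/* e.g. two copies kept by RAID1 */

	/* the value the patched btrfs_statfs() would feed into f_bavail */
	printf("usable free data: %llu GiB\n",
	       (total_free_data / factor) >> 30);
	return 0;
}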
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index b2e7bb4393f6..92db3f648df4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -111,7 +111,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
111{ 111{
112 struct btrfs_fs_info *fs_info; 112 struct btrfs_fs_info *fs_info;
113 struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); 113 struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
114 struct btrfs_trans_handle *trans;
115 u64 features, set, clear; 114 u64 features, set, clear;
116 unsigned long val; 115 unsigned long val;
117 int ret; 116 int ret;
@@ -153,10 +152,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
153 btrfs_info(fs_info, "%s %s feature flag", 152 btrfs_info(fs_info, "%s %s feature flag",
154 val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); 153 val ? "Setting" : "Clearing", fa->kobj_attr.attr.name);
155 154
156 trans = btrfs_start_transaction(fs_info->fs_root, 0);
157 if (IS_ERR(trans))
158 return PTR_ERR(trans);
159
160 spin_lock(&fs_info->super_lock); 155 spin_lock(&fs_info->super_lock);
161 features = get_features(fs_info, fa->feature_set); 156 features = get_features(fs_info, fa->feature_set);
162 if (val) 157 if (val)
@@ -166,9 +161,11 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
166 set_features(fs_info, fa->feature_set, features); 161 set_features(fs_info, fa->feature_set, features);
167 spin_unlock(&fs_info->super_lock); 162 spin_unlock(&fs_info->super_lock);
168 163
169 ret = btrfs_commit_transaction(trans, fs_info->fs_root); 164 /*
170 if (ret) 165 * We don't want to do full transaction commit from inside sysfs
171 return ret; 166 */
167 btrfs_set_pending(fs_info, COMMIT);
168 wake_up_process(fs_info->transaction_kthread);
172 169
173 return count; 170 return count;
174} 171}
@@ -372,9 +369,6 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
372 const char *buf, size_t len) 369 const char *buf, size_t len)
373{ 370{
374 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 371 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
375 struct btrfs_trans_handle *trans;
376 struct btrfs_root *root = fs_info->fs_root;
377 int ret;
378 size_t p_len; 372 size_t p_len;
379 373
380 if (fs_info->sb->s_flags & MS_RDONLY) 374 if (fs_info->sb->s_flags & MS_RDONLY)
@@ -389,20 +383,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
389 if (p_len >= BTRFS_LABEL_SIZE) 383 if (p_len >= BTRFS_LABEL_SIZE)
390 return -EINVAL; 384 return -EINVAL;
391 385
392 trans = btrfs_start_transaction(root, 0); 386 spin_lock(&fs_info->super_lock);
393 if (IS_ERR(trans))
394 return PTR_ERR(trans);
395
396 spin_lock(&root->fs_info->super_lock);
397 memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); 387 memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
398 memcpy(fs_info->super_copy->label, buf, p_len); 388 memcpy(fs_info->super_copy->label, buf, p_len);
399 spin_unlock(&root->fs_info->super_lock); 389 spin_unlock(&fs_info->super_lock);
400 ret = btrfs_commit_transaction(trans, root);
401 390
402 if (!ret) 391 /*
403 return len; 392 * We don't want to do full transaction commit from inside sysfs
393 */
394 btrfs_set_pending(fs_info, COMMIT);
395 wake_up_process(fs_info->transaction_kthread);
404 396
405 return ret; 397 return len;
406} 398}
407BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); 399BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
408 400
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dcaae3616728..a605d4e2f2bc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
76 } 76 }
77} 77}
78 78
79static void clear_btree_io_tree(struct extent_io_tree *tree)
80{
81 spin_lock(&tree->lock);
82 while (!RB_EMPTY_ROOT(&tree->state)) {
83 struct rb_node *node;
84 struct extent_state *state;
85
86 node = rb_first(&tree->state);
87 state = rb_entry(node, struct extent_state, rb_node);
88 rb_erase(&state->rb_node, &tree->state);
89 RB_CLEAR_NODE(&state->rb_node);
90 /*
91 * btree io trees aren't supposed to have tasks waiting for
92 * changes in the flags of extent states ever.
93 */
94 ASSERT(!waitqueue_active(&state->wq));
95 free_extent_state(state);
96 if (need_resched()) {
97 spin_unlock(&tree->lock);
98 cond_resched();
99 spin_lock(&tree->lock);
100 }
101 }
102 spin_unlock(&tree->lock);
103}
104
79static noinline void switch_commit_roots(struct btrfs_transaction *trans, 105static noinline void switch_commit_roots(struct btrfs_transaction *trans,
80 struct btrfs_fs_info *fs_info) 106 struct btrfs_fs_info *fs_info)
81{ 107{
@@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
89 root->commit_root = btrfs_root_node(root); 115 root->commit_root = btrfs_root_node(root);
90 if (is_fstree(root->objectid)) 116 if (is_fstree(root->objectid))
91 btrfs_unpin_free_ino(root); 117 btrfs_unpin_free_ino(root);
118 clear_btree_io_tree(&root->dirty_log_pages);
92 } 119 }
93 up_write(&fs_info->commit_root_sem); 120 up_write(&fs_info->commit_root_sem);
94} 121}
@@ -220,6 +247,7 @@ loop:
220 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 247 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
221 INIT_LIST_HEAD(&cur_trans->pending_chunks); 248 INIT_LIST_HEAD(&cur_trans->pending_chunks);
222 INIT_LIST_HEAD(&cur_trans->switch_commits); 249 INIT_LIST_HEAD(&cur_trans->switch_commits);
250 INIT_LIST_HEAD(&cur_trans->pending_ordered);
223 list_add_tail(&cur_trans->list, &fs_info->trans_list); 251 list_add_tail(&cur_trans->list, &fs_info->trans_list);
224 extent_io_tree_init(&cur_trans->dirty_pages, 252 extent_io_tree_init(&cur_trans->dirty_pages,
225 fs_info->btree_inode->i_mapping); 253 fs_info->btree_inode->i_mapping);
@@ -488,6 +516,7 @@ again:
488 h->sync = false; 516 h->sync = false;
489 INIT_LIST_HEAD(&h->qgroup_ref_list); 517 INIT_LIST_HEAD(&h->qgroup_ref_list);
490 INIT_LIST_HEAD(&h->new_bgs); 518 INIT_LIST_HEAD(&h->new_bgs);
519 INIT_LIST_HEAD(&h->ordered);
491 520
492 smp_mb(); 521 smp_mb();
493 if (cur_trans->state >= TRANS_STATE_BLOCKED && 522 if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -719,6 +748,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
719 if (!list_empty(&trans->new_bgs)) 748 if (!list_empty(&trans->new_bgs))
720 btrfs_create_pending_block_groups(trans, root); 749 btrfs_create_pending_block_groups(trans, root);
721 750
751 if (!list_empty(&trans->ordered)) {
752 spin_lock(&info->trans_lock);
753 list_splice(&trans->ordered, &cur_trans->pending_ordered);
754 spin_unlock(&info->trans_lock);
755 }
756
722 trans->delayed_ref_updates = 0; 757 trans->delayed_ref_updates = 0;
723 if (!trans->sync) { 758 if (!trans->sync) {
724 must_run_delayed_refs = 759 must_run_delayed_refs =
@@ -828,17 +863,39 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
828 863
829 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 864 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
830 mark, &cached_state)) { 865 mark, &cached_state)) {
831 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 866 bool wait_writeback = false;
832 mark, &cached_state, GFP_NOFS); 867
833 cached_state = NULL; 868 err = convert_extent_bit(dirty_pages, start, end,
834 err = filemap_fdatawrite_range(mapping, start, end); 869 EXTENT_NEED_WAIT,
870 mark, &cached_state, GFP_NOFS);
871 /*
872 * convert_extent_bit can return -ENOMEM, which is most of the
873 * time a temporary error. So when it happens, ignore the error
874 * and wait for writeback of this range to finish - because we
875 * failed to set the bit EXTENT_NEED_WAIT for the range, a call
876 * to btrfs_wait_marked_extents() would not know that writeback
877 * for this range started and therefore wouldn't wait for it to
878 * finish - we don't want to commit a superblock that points to
879 * btree nodes/leafs for which writeback hasn't finished yet
880 * (and without errors).
881 * We cleanup any entries left in the io tree when committing
882 * the transaction (through clear_btree_io_tree()).
883 */
884 if (err == -ENOMEM) {
885 err = 0;
886 wait_writeback = true;
887 }
888 if (!err)
889 err = filemap_fdatawrite_range(mapping, start, end);
835 if (err) 890 if (err)
836 werr = err; 891 werr = err;
892 else if (wait_writeback)
893 werr = filemap_fdatawait_range(mapping, start, end);
894 free_extent_state(cached_state);
895 cached_state = NULL;
837 cond_resched(); 896 cond_resched();
838 start = end + 1; 897 start = end + 1;
839 } 898 }
840 if (err)
841 werr = err;
842 return werr; 899 return werr;
843} 900}
844 901
@@ -862,11 +919,25 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
862 919
863 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 920 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
864 EXTENT_NEED_WAIT, &cached_state)) { 921 EXTENT_NEED_WAIT, &cached_state)) {
865 clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 922 /*
866 0, 0, &cached_state, GFP_NOFS); 923 * Ignore -ENOMEM errors returned by clear_extent_bit().
867 err = filemap_fdatawait_range(mapping, start, end); 924 * When committing the transaction, we'll remove any entries
925 * left in the io tree. For a log commit, we don't remove them
926 * after committing the log because the tree can be accessed
927 * concurrently - we do it only at transaction commit time when
928 * it's safe to do it (through clear_btree_io_tree()).
929 */
930 err = clear_extent_bit(dirty_pages, start, end,
931 EXTENT_NEED_WAIT,
932 0, 0, &cached_state, GFP_NOFS);
933 if (err == -ENOMEM)
934 err = 0;
935 if (!err)
936 err = filemap_fdatawait_range(mapping, start, end);
868 if (err) 937 if (err)
869 werr = err; 938 werr = err;
939 free_extent_state(cached_state);
940 cached_state = NULL;
870 cond_resched(); 941 cond_resched();
871 start = end + 1; 942 start = end + 1;
872 } 943 }
@@ -919,17 +990,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
919 return 0; 990 return 0;
920} 991}
921 992
922int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 993static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
923 struct btrfs_root *root) 994 struct btrfs_root *root)
924{ 995{
925 if (!trans || !trans->transaction) { 996 int ret;
926 struct inode *btree_inode; 997
927 btree_inode = root->fs_info->btree_inode; 998 ret = btrfs_write_and_wait_marked_extents(root,
928 return filemap_write_and_wait(btree_inode->i_mapping);
929 }
930 return btrfs_write_and_wait_marked_extents(root,
931 &trans->transaction->dirty_pages, 999 &trans->transaction->dirty_pages,
932 EXTENT_DIRTY); 1000 EXTENT_DIRTY);
1001 clear_btree_io_tree(&trans->transaction->dirty_pages);
1002
1003 return ret;
933} 1004}
934 1005
935/* 1006/*
@@ -1652,6 +1723,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
1652 btrfs_wait_ordered_roots(fs_info, -1); 1723 btrfs_wait_ordered_roots(fs_info, -1);
1653} 1724}
1654 1725
1726static inline void
1727btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
1728 struct btrfs_fs_info *fs_info)
1729{
1730 struct btrfs_ordered_extent *ordered;
1731
1732 spin_lock(&fs_info->trans_lock);
1733 while (!list_empty(&cur_trans->pending_ordered)) {
1734 ordered = list_first_entry(&cur_trans->pending_ordered,
1735 struct btrfs_ordered_extent,
1736 trans_list);
1737 list_del_init(&ordered->trans_list);
1738 spin_unlock(&fs_info->trans_lock);
1739
1740 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
1741 &ordered->flags));
1742 btrfs_put_ordered_extent(ordered);
1743 spin_lock(&fs_info->trans_lock);
1744 }
1745 spin_unlock(&fs_info->trans_lock);
1746}
1747
1655int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1748int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1656 struct btrfs_root *root) 1749 struct btrfs_root *root)
1657{ 1750{
@@ -1702,6 +1795,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1702 } 1795 }
1703 1796
1704 spin_lock(&root->fs_info->trans_lock); 1797 spin_lock(&root->fs_info->trans_lock);
1798 list_splice(&trans->ordered, &cur_trans->pending_ordered);
1705 if (cur_trans->state >= TRANS_STATE_COMMIT_START) { 1799 if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
1706 spin_unlock(&root->fs_info->trans_lock); 1800 spin_unlock(&root->fs_info->trans_lock);
1707 atomic_inc(&cur_trans->use_count); 1801 atomic_inc(&cur_trans->use_count);
@@ -1754,6 +1848,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1754 1848
1755 btrfs_wait_delalloc_flush(root->fs_info); 1849 btrfs_wait_delalloc_flush(root->fs_info);
1756 1850
1851 btrfs_wait_pending_ordered(cur_trans, root->fs_info);
1852
1757 btrfs_scrub_pause(root); 1853 btrfs_scrub_pause(root);
1758 /* 1854 /*
1759 * Ok now we need to make sure to block out any other joins while we 1855 * Ok now we need to make sure to block out any other joins while we
@@ -1842,13 +1938,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1842 } 1938 }
1843 1939
1844 /* 1940 /*
1845 * Since the transaction is done, we should set the inode map cache flag 1941 * Since the transaction is done, we can apply the pending changes
1846 * before any other comming transaction. 1942 * before the next transaction.
1847 */ 1943 */
1848 if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) 1944 btrfs_apply_pending_changes(root->fs_info);
1849 btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
1850 else
1851 btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
1852 1945
1853 /* commit_fs_roots gets rid of all the tree log roots, it is now 1946 /* commit_fs_roots gets rid of all the tree log roots, it is now
1854 * safe to free the root of tree log roots 1947 * safe to free the root of tree log roots
@@ -2019,3 +2112,32 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
2019 2112
2020 return (ret < 0) ? 0 : 1; 2113 return (ret < 0) ? 0 : 1;
2021} 2114}
2115
2116void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
2117{
2118 unsigned long prev;
2119 unsigned long bit;
2120
 2121 prev = xchg(&fs_info->pending_changes, 0);
2122 if (!prev)
2123 return;
2124
2125 bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
2126 if (prev & bit)
2127 btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
2128 prev &= ~bit;
2129
2130 bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
2131 if (prev & bit)
2132 btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
2133 prev &= ~bit;
2134
2135 bit = 1 << BTRFS_PENDING_COMMIT;
2136 if (prev & bit)
2137 btrfs_debug(fs_info, "pending commit done");
2138 prev &= ~bit;
2139
2140 if (prev)
2141 btrfs_warn(fs_info,
2142 "unknown pending changes left 0x%lx, ignoring", prev);
2143}
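btrfs_apply_pending_changes() is the single consumer of fs_info->pending_changes: producers such as the mount option parser and the sysfs label/feature stores earlier in this series only set a bit (via helpers added elsewhere in this series, not shown in this hunk) and at most wake the transaction kthread, and the bits are fetched, cleared and acted upon here at transaction commit time. A hedged user-space sketch of that handshake with C11 atomics (all names and bit values invented for illustration):

#include <stdio.h>
#include <stdatomic.h>

#define PENDING_SET_INODE_MAP_CACHE	0
#define PENDING_COMMIT			1

static atomic_ulong pending_changes;

/* producer side (mount option parsing, sysfs store, ...):
 * just record the request, the commit path applies it later */
static void set_pending(int bit)
{
	atomic_fetch_or(&pending_changes, 1UL << bit);
}

/* consumer side, run once per transaction commit:
 * atomically fetch-and-clear the word, then act on each known bit */
static void apply_pending_changes(void)
{
	unsigned long prev = atomic_exchange(&pending_changes, 0);

	if (!prev)
		return;
	if (prev & (1UL << PENDING_SET_INODE_MAP_CACHE))
		printf("enable inode map cache\n");
	if (prev & (1UL << PENDING_COMMIT))
		printf("pending commit done\n");
}

int main(void)
{
	set_pending(PENDING_COMMIT);
	apply_pending_changes();	/* acts on the commit request */
	apply_pending_changes();	/* nothing left: returns immediately */
	return 0;
}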
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d8f40e1a5d2d..00ed29c4b3f9 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -56,6 +56,7 @@ struct btrfs_transaction {
56 wait_queue_head_t commit_wait; 56 wait_queue_head_t commit_wait;
57 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
58 struct list_head pending_chunks; 58 struct list_head pending_chunks;
59 struct list_head pending_ordered;
59 struct list_head switch_commits; 60 struct list_head switch_commits;
60 struct btrfs_delayed_ref_root delayed_refs; 61 struct btrfs_delayed_ref_root delayed_refs;
61 int aborted; 62 int aborted;
@@ -105,6 +106,7 @@ struct btrfs_trans_handle {
105 */ 106 */
106 struct btrfs_root *root; 107 struct btrfs_root *root;
107 struct seq_list delayed_ref_elem; 108 struct seq_list delayed_ref_elem;
109 struct list_head ordered;
108 struct list_head qgroup_ref_list; 110 struct list_head qgroup_ref_list;
109 struct list_head new_bgs; 111 struct list_head new_bgs;
110}; 112};
@@ -145,8 +147,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
145 struct btrfs_root *root); 147 struct btrfs_root *root);
146struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); 148struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
147int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); 149int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
148int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
149 struct btrfs_root *root);
150 150
151void btrfs_add_dead_root(struct btrfs_root *root); 151void btrfs_add_dead_root(struct btrfs_root *root);
152int btrfs_defrag_root(struct btrfs_root *root); 152int btrfs_defrag_root(struct btrfs_root *root);
@@ -170,4 +170,6 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
170int btrfs_transaction_blocked(struct btrfs_fs_info *info); 170int btrfs_transaction_blocked(struct btrfs_fs_info *info);
171int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 171int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
172void btrfs_put_transaction(struct btrfs_transaction *transaction); 172void btrfs_put_transaction(struct btrfs_transaction *transaction);
173void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
174
173#endif 175#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 286213cec861..9a02da16f2be 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2599,12 +2599,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2599 index2 = root_log_ctx.log_transid % 2; 2599 index2 = root_log_ctx.log_transid % 2;
2600 if (atomic_read(&log_root_tree->log_commit[index2])) { 2600 if (atomic_read(&log_root_tree->log_commit[index2])) {
2601 blk_finish_plug(&plug); 2601 blk_finish_plug(&plug);
2602 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2602 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
2603 mark);
2604 btrfs_wait_logged_extents(trans, log, log_transid);
2603 wait_log_commit(trans, log_root_tree, 2605 wait_log_commit(trans, log_root_tree,
2604 root_log_ctx.log_transid); 2606 root_log_ctx.log_transid);
2605 btrfs_free_logged_extents(log, log_transid);
2606 mutex_unlock(&log_root_tree->log_mutex); 2607 mutex_unlock(&log_root_tree->log_mutex);
2607 ret = root_log_ctx.log_ret; 2608 if (!ret)
2609 ret = root_log_ctx.log_ret;
2608 goto out; 2610 goto out;
2609 } 2611 }
2610 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 2612 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
@@ -2641,11 +2643,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2641 mutex_unlock(&log_root_tree->log_mutex); 2643 mutex_unlock(&log_root_tree->log_mutex);
2642 goto out_wake_log_root; 2644 goto out_wake_log_root;
2643 } 2645 }
2644 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2646 ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2645 btrfs_wait_marked_extents(log_root_tree, 2647 if (!ret)
2646 &log_root_tree->dirty_log_pages, 2648 ret = btrfs_wait_marked_extents(log_root_tree,
2647 EXTENT_NEW | EXTENT_DIRTY); 2649 &log_root_tree->dirty_log_pages,
2648 btrfs_wait_logged_extents(log, log_transid); 2650 EXTENT_NEW | EXTENT_DIRTY);
2651 if (ret) {
2652 btrfs_set_log_full_commit(root->fs_info, trans);
2653 btrfs_free_logged_extents(log, log_transid);
2654 mutex_unlock(&log_root_tree->log_mutex);
2655 goto out_wake_log_root;
2656 }
2657 btrfs_wait_logged_extents(trans, log, log_transid);
2649 2658
2650 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2659 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2651 log_root_tree->node->start); 2660 log_root_tree->node->start);
@@ -3626,6 +3635,12 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
3626 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 3635 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
3627 3636
3628 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 3637 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3638 /*
3639 * Clear the AS_EIO/AS_ENOSPC flags from the inode's
3640 * i_mapping flags, so that the next fsync won't get
3641 * an outdated io error too.
3642 */
3643 btrfs_inode_check_errors(inode);
3629 *ordered_io_error = true; 3644 *ordered_io_error = true;
3630 break; 3645 break;
3631 } 3646 }
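
The new comment above refers to clearing AS_EIO/AS_ENOSPC from the inode's i_mapping so that a later fsync does not report a stale error. A minimal sketch of what a helper such as btrfs_inode_check_errors() is assumed to do, mirroring the generic filemap error check rather than quoting the btrfs implementation:

#include <linux/fs.h>
#include <linux/pagemap.h>

/* Report and clear any writeback error recorded on the inode's address space. */
static int inode_check_errors_sketch(struct inode *inode)
{
	int ret = 0;

	if (test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
		ret = -ENOSPC;
	if (test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
		ret = -EIO;
	return ret;
}
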
@@ -3766,7 +3781,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3766 fi = btrfs_item_ptr(leaf, path->slots[0], 3781 fi = btrfs_item_ptr(leaf, path->slots[0],
3767 struct btrfs_file_extent_item); 3782 struct btrfs_file_extent_item);
3768 3783
3769 btrfs_set_token_file_extent_generation(leaf, fi, em->generation, 3784 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
3770 &token); 3785 &token);
3771 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 3786 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3772 btrfs_set_token_file_extent_type(leaf, fi, 3787 btrfs_set_token_file_extent_type(leaf, fi,
@@ -3963,7 +3978,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3963 3978
3964 mutex_lock(&BTRFS_I(inode)->log_mutex); 3979 mutex_lock(&BTRFS_I(inode)->log_mutex);
3965 3980
3966 btrfs_get_logged_extents(inode, &logged_list); 3981 btrfs_get_logged_extents(inode, &logged_list, start, end);
3967 3982
3968 /* 3983 /*
3969 * a brute force approach to making sure we get the most uptodate 3984 * a brute force approach to making sure we get the most uptodate
@@ -4089,6 +4104,21 @@ log_extents:
4089 btrfs_release_path(path); 4104 btrfs_release_path(path);
4090 btrfs_release_path(dst_path); 4105 btrfs_release_path(dst_path);
4091 if (fast_search) { 4106 if (fast_search) {
4107 /*
4108 * Some ordered extents started by fsync might have completed
4109 * before we collected the ordered extents in logged_list, which
4110 * means they're gone, not in our logged_list nor in the inode's
4111 * ordered tree. We want the application/user space to know an
4112 * error happened while attempting to persist file data so that
4113 * it can take proper action. If such an error happened, we leave
4114 * without writing to the log tree and the fsync must report the
4115 * file data write error and not commit the current transaction.
4116 */
4117 err = btrfs_inode_check_errors(inode);
4118 if (err) {
4119 ctx->io_err = err;
4120 goto out_unlock;
4121 }
4092 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4122 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4093 &logged_list, ctx); 4123 &logged_list, ctx);
4094 if (ret) { 4124 if (ret) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d47289c715c8..0144790e296e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -53,16 +53,6 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
53DEFINE_MUTEX(uuid_mutex); 53DEFINE_MUTEX(uuid_mutex);
54static LIST_HEAD(fs_uuids); 54static LIST_HEAD(fs_uuids);
55 55
56static void lock_chunks(struct btrfs_root *root)
57{
58 mutex_lock(&root->fs_info->chunk_mutex);
59}
60
61static void unlock_chunks(struct btrfs_root *root)
62{
63 mutex_unlock(&root->fs_info->chunk_mutex);
64}
65
66static struct btrfs_fs_devices *__alloc_fs_devices(void) 56static struct btrfs_fs_devices *__alloc_fs_devices(void)
67{ 57{
68 struct btrfs_fs_devices *fs_devs; 58 struct btrfs_fs_devices *fs_devs;
@@ -1068,9 +1058,11 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
1068 u64 *start, u64 len) 1058 u64 *start, u64 len)
1069{ 1059{
1070 struct extent_map *em; 1060 struct extent_map *em;
1061 struct list_head *search_list = &trans->transaction->pending_chunks;
1071 int ret = 0; 1062 int ret = 0;
1072 1063
1073 list_for_each_entry(em, &trans->transaction->pending_chunks, list) { 1064again:
1065 list_for_each_entry(em, search_list, list) {
1074 struct map_lookup *map; 1066 struct map_lookup *map;
1075 int i; 1067 int i;
1076 1068
@@ -1087,6 +1079,10 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
1087 ret = 1; 1079 ret = 1;
1088 } 1080 }
1089 } 1081 }
1082 if (search_list == &trans->transaction->pending_chunks) {
1083 search_list = &trans->root->fs_info->pinned_chunks;
1084 goto again;
1085 }
1090 1086
1091 return ret; 1087 return ret;
1092} 1088}
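
The change above makes contains_pending_extent() scan a second list, fs_info->pinned_chunks, after the transaction's pending_chunks, by swapping the list head and jumping back to the top of the loop. A simplified kernel-style sketch of that two-pass walk; the struct and the overlap test are illustrative, and the real function also narrows *start:

#include <linux/list.h>
#include <linux/types.h>

struct pending_em {
	struct list_head list;
	u64 start;
	u64 len;
};

/* Return 1 if [start, start + len) overlaps an entry on either list. */
static int overlaps_pending(struct list_head *list_a, struct list_head *list_b,
			    u64 start, u64 len)
{
	struct list_head *search = list_a;
	struct pending_em *em;

again:
	list_for_each_entry(em, search, list) {
		if (start < em->start + em->len && em->start < start + len)
			return 1;
	}
	if (search == list_a) {
		search = list_b;	/* second pass over the pinned list */
		goto again;
	}
	return 0;
}
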
@@ -1800,8 +1796,8 @@ error_undo:
1800 goto error_brelse; 1796 goto error_brelse;
1801} 1797}
1802 1798
1803void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, 1799void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
1804 struct btrfs_device *srcdev) 1800 struct btrfs_device *srcdev)
1805{ 1801{
1806 struct btrfs_fs_devices *fs_devices; 1802 struct btrfs_fs_devices *fs_devices;
1807 1803
@@ -1829,6 +1825,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1829 1825
1830 if (srcdev->bdev) 1826 if (srcdev->bdev)
1831 fs_devices->open_devices--; 1827 fs_devices->open_devices--;
1828}
1829
1830void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
1831 struct btrfs_device *srcdev)
1832{
1833 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
1832 1834
1833 call_rcu(&srcdev->rcu, free_device); 1835 call_rcu(&srcdev->rcu, free_device);
1834 1836
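
btrfs_rm_dev_replace_srcdev() is split here into a part that unhooks the source device from its fs_devices and a part that only hands it to call_rcu() for freeing. A hypothetical caller sketch of the intended two-phase teardown; the locking shown is an assumption for illustration, not a quote of dev-replace.c:

/* Assumes the btrfs internal headers (ctree.h, volumes.h) are available. */
static void replace_finish_sketch(struct btrfs_fs_info *fs_info,
				  struct btrfs_device *src_device)
{
	lock_chunks(fs_info->chunk_root);
	/* detach the source device while device counts cannot change under us */
	btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
	unlock_chunks(fs_info->chunk_root);

	/* with no chunk mutex held, defer the actual freeing to RCU */
	btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
}
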
@@ -2647,18 +2649,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2647 } 2649 }
2648 } 2650 }
2649 2651
2650 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 2652 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
2651 if (ret) { 2653 if (ret) {
2652 btrfs_abort_transaction(trans, extent_root, ret); 2654 btrfs_abort_transaction(trans, extent_root, ret);
2653 goto out; 2655 goto out;
2654 } 2656 }
2655 2657
2656 write_lock(&em_tree->lock);
2657 remove_extent_mapping(em_tree, em);
2658 write_unlock(&em_tree->lock);
2659
2660 /* once for the tree */
2661 free_extent_map(em);
2662out: 2658out:
2663 /* once for us */ 2659 /* once for us */
2664 free_extent_map(em); 2660 free_extent_map(em);
@@ -4505,6 +4501,8 @@ error_del_extent:
4505 free_extent_map(em); 4501 free_extent_map(em);
4506 /* One for the tree reference */ 4502 /* One for the tree reference */
4507 free_extent_map(em); 4503 free_extent_map(em);
4504 /* One for the pending_chunks list reference */
4505 free_extent_map(em);
4508error: 4506error:
4509 kfree(devices_info); 4507 kfree(devices_info);
4510 return ret; 4508 return ret;
@@ -4881,13 +4879,15 @@ static inline int parity_smaller(u64 a, u64 b)
4881static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) 4879static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4882{ 4880{
4883 struct btrfs_bio_stripe s; 4881 struct btrfs_bio_stripe s;
4882 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
4884 int i; 4883 int i;
4885 u64 l; 4884 u64 l;
4886 int again = 1; 4885 int again = 1;
4886 int m;
4887 4887
4888 while (again) { 4888 while (again) {
4889 again = 0; 4889 again = 0;
4890 for (i = 0; i < bbio->num_stripes - 1; i++) { 4890 for (i = 0; i < real_stripes - 1; i++) {
4891 if (parity_smaller(raid_map[i], raid_map[i+1])) { 4891 if (parity_smaller(raid_map[i], raid_map[i+1])) {
4892 s = bbio->stripes[i]; 4892 s = bbio->stripes[i];
4893 l = raid_map[i]; 4893 l = raid_map[i];
@@ -4895,6 +4895,14 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4895 raid_map[i] = raid_map[i+1]; 4895 raid_map[i] = raid_map[i+1];
4896 bbio->stripes[i+1] = s; 4896 bbio->stripes[i+1] = s;
4897 raid_map[i+1] = l; 4897 raid_map[i+1] = l;
4898
4899 if (bbio->tgtdev_map) {
4900 m = bbio->tgtdev_map[i];
4901 bbio->tgtdev_map[i] =
4902 bbio->tgtdev_map[i + 1];
4903 bbio->tgtdev_map[i + 1] = m;
4904 }
4905
4898 again = 1; 4906 again = 1;
4899 } 4907 }
4900 } 4908 }
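
sort_parity_stripes() keeps three parallel arrays consistent: whenever two stripes are swapped to move parity to the end, the matching raid_map and, with this change, tgtdev_map entries are swapped too, and only the first real_stripes entries (the copies destined for the replace target are excluded) take part. A stand-alone sketch of that parallel-array bubble sort with illustrative data:

#include <stdint.h>
#include <stdio.h>

struct stripe { uint64_t physical; };

/* Parity entries carry the largest raid_map values, so they sort to the end. */
static int parity_smaller(uint64_t a, uint64_t b)
{
	return a > b;
}

static void sort_parity_sketch(struct stripe *stripes, uint64_t *raid_map,
			       int *tgtdev_map, int real_stripes)
{
	int again = 1;

	while (again) {
		again = 0;
		for (int i = 0; i < real_stripes - 1; i++) {
			if (!parity_smaller(raid_map[i], raid_map[i + 1]))
				continue;
			struct stripe s = stripes[i];
			uint64_t l = raid_map[i];
			int m = tgtdev_map[i];

			stripes[i] = stripes[i + 1];	/* keep all three arrays in lockstep */
			raid_map[i] = raid_map[i + 1];
			tgtdev_map[i] = tgtdev_map[i + 1];
			stripes[i + 1] = s;
			raid_map[i + 1] = l;
			tgtdev_map[i + 1] = m;
			again = 1;
		}
	}
}

int main(void)
{
	struct stripe stripes[3] = { { 300 }, { 100 }, { 200 } };
	uint64_t raid_map[3] = { (uint64_t)-2, 0, 65536 };	/* parity, data 0, data 1 */
	int tgtdev_map[3] = { 2, 0, 1 };

	sort_parity_sketch(stripes, raid_map, tgtdev_map, 3);
	for (int i = 0; i < 3; i++)
		printf("%llu -> physical %llu, tgtdev slot %d\n",
		       (unsigned long long)raid_map[i],
		       (unsigned long long)stripes[i].physical, tgtdev_map[i]);
	return 0;
}
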
@@ -4923,6 +4931,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4923 int ret = 0; 4931 int ret = 0;
4924 int num_stripes; 4932 int num_stripes;
4925 int max_errors = 0; 4933 int max_errors = 0;
4934 int tgtdev_indexes = 0;
4926 struct btrfs_bio *bbio = NULL; 4935 struct btrfs_bio *bbio = NULL;
4927 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 4936 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4928 int dev_replace_is_ongoing = 0; 4937 int dev_replace_is_ongoing = 0;
@@ -5161,15 +5170,14 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5161 BTRFS_BLOCK_GROUP_RAID6)) { 5170 BTRFS_BLOCK_GROUP_RAID6)) {
5162 u64 tmp; 5171 u64 tmp;
5163 5172
5164 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) 5173 if (raid_map_ret &&
5165 && raid_map_ret) { 5174 ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
5175 mirror_num > 1)) {
5166 int i, rot; 5176 int i, rot;
5167 5177
5168 /* push stripe_nr back to the start of the full stripe */ 5178 /* push stripe_nr back to the start of the full stripe */
5169 stripe_nr = raid56_full_stripe_start; 5179 stripe_nr = raid56_full_stripe_start;
5170 do_div(stripe_nr, stripe_len); 5180 do_div(stripe_nr, stripe_len * nr_data_stripes(map));
5171
5172 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
5173 5181
5174 /* RAID[56] write or recovery. Return all stripes */ 5182 /* RAID[56] write or recovery. Return all stripes */
5175 num_stripes = map->num_stripes; 5183 num_stripes = map->num_stripes;
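
The new division above maps an I/O offset to its RAID5/6 full stripe in one step, since a full stripe spans stripe_len bytes on each of the nr_data_stripes(map) data devices. A worked example, assuming a 64 KiB stripe_len and a 4-device RAID5 chunk (3 data stripes plus parity):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t stripe_len = 64 * 1024;	/* 64 KiB per device */
	uint64_t nr_data_stripes = 3;		/* RAID5 on 4 devices */
	uint64_t full_stripe_len = stripe_len * nr_data_stripes;	/* 192 KiB of data per full stripe */

	/* offset of the full stripe start, relative to the chunk, e.g. 960 KiB */
	uint64_t raid56_full_stripe_start = 5 * full_stripe_len;

	/* the quotient do_div() leaves behind: the full-stripe number */
	uint64_t stripe_nr = raid56_full_stripe_start / full_stripe_len;

	printf("full stripe #%llu\n", (unsigned long long)stripe_nr);	/* prints 5 */
	return 0;
}
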
@@ -5235,14 +5243,19 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5235 num_alloc_stripes <<= 1; 5243 num_alloc_stripes <<= 1;
5236 if (rw & REQ_GET_READ_MIRRORS) 5244 if (rw & REQ_GET_READ_MIRRORS)
5237 num_alloc_stripes++; 5245 num_alloc_stripes++;
5246 tgtdev_indexes = num_stripes;
5238 } 5247 }
5239 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); 5248
5249 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes),
5250 GFP_NOFS);
5240 if (!bbio) { 5251 if (!bbio) {
5241 kfree(raid_map); 5252 kfree(raid_map);
5242 ret = -ENOMEM; 5253 ret = -ENOMEM;
5243 goto out; 5254 goto out;
5244 } 5255 }
5245 atomic_set(&bbio->error, 0); 5256 atomic_set(&bbio->error, 0);
5257 if (dev_replace_is_ongoing)
5258 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
5246 5259
5247 if (rw & REQ_DISCARD) { 5260 if (rw & REQ_DISCARD) {
5248 int factor = 0; 5261 int factor = 0;
@@ -5327,6 +5340,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5327 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) 5340 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5328 max_errors = btrfs_chunk_max_errors(map); 5341 max_errors = btrfs_chunk_max_errors(map);
5329 5342
5343 tgtdev_indexes = 0;
5330 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 5344 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
5331 dev_replace->tgtdev != NULL) { 5345 dev_replace->tgtdev != NULL) {
5332 int index_where_to_add; 5346 int index_where_to_add;
@@ -5355,8 +5369,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5355 new->physical = old->physical; 5369 new->physical = old->physical;
5356 new->length = old->length; 5370 new->length = old->length;
5357 new->dev = dev_replace->tgtdev; 5371 new->dev = dev_replace->tgtdev;
5372 bbio->tgtdev_map[i] = index_where_to_add;
5358 index_where_to_add++; 5373 index_where_to_add++;
5359 max_errors++; 5374 max_errors++;
5375 tgtdev_indexes++;
5360 } 5376 }
5361 } 5377 }
5362 num_stripes = index_where_to_add; 5378 num_stripes = index_where_to_add;
@@ -5402,7 +5418,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5402 tgtdev_stripe->length = 5418 tgtdev_stripe->length =
5403 bbio->stripes[index_srcdev].length; 5419 bbio->stripes[index_srcdev].length;
5404 tgtdev_stripe->dev = dev_replace->tgtdev; 5420 tgtdev_stripe->dev = dev_replace->tgtdev;
5421 bbio->tgtdev_map[index_srcdev] = num_stripes;
5405 5422
5423 tgtdev_indexes++;
5406 num_stripes++; 5424 num_stripes++;
5407 } 5425 }
5408 } 5426 }
@@ -5412,6 +5430,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5412 bbio->num_stripes = num_stripes; 5430 bbio->num_stripes = num_stripes;
5413 bbio->max_errors = max_errors; 5431 bbio->max_errors = max_errors;
5414 bbio->mirror_num = mirror_num; 5432 bbio->mirror_num = mirror_num;
5433 bbio->num_tgtdevs = tgtdev_indexes;
5415 5434
5416 /* 5435 /*
5417 * this is the case that REQ_READ && dev_replace_is_ongoing && 5436 * this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -5443,6 +5462,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5443 mirror_num, NULL); 5462 mirror_num, NULL);
5444} 5463}
5445 5464
5465/* For Scrub/replace */
5466int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
5467 u64 logical, u64 *length,
5468 struct btrfs_bio **bbio_ret, int mirror_num,
5469 u64 **raid_map_ret)
5470{
5471 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5472 mirror_num, raid_map_ret);
5473}
5474
5446int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 5475int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5447 u64 chunk_start, u64 physical, u64 devid, 5476 u64 chunk_start, u64 physical, u64 devid,
5448 u64 **logical, int *naddrs, int *stripe_len) 5477 u64 **logical, int *naddrs, int *stripe_len)
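
btrfs_map_sblock() simply re-exports __btrfs_map_block() with the raid_map output, so scrub and device replace can obtain every stripe of a full stripe together with its logical layout. A hypothetical caller sketch; the flag and the error handling are assumptions for illustration, not copied from scrub.c:

/* Assumes the btrfs internal headers; 'logical' and 'length' come from the caller. */
static int map_full_stripe_sketch(struct btrfs_fs_info *fs_info,
				  u64 logical, u64 *length)
{
	struct btrfs_bio *bbio = NULL;
	u64 *raid_map = NULL;
	int ret;

	ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, length,
			       &bbio, 0 /* mirror_num */, &raid_map);
	if (ret)
		return ret;

	/* ... issue per-stripe I/O using bbio->stripes[] and raid_map[] ... */

	kfree(raid_map);
	kfree(bbio);
	return 0;
}
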
@@ -5812,12 +5841,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5812 } else { 5841 } else {
5813 ret = raid56_parity_recover(root, bio, bbio, 5842 ret = raid56_parity_recover(root, bio, bbio,
5814 raid_map, map_length, 5843 raid_map, map_length,
5815 mirror_num); 5844 mirror_num, 1);
5816 } 5845 }
5817 /* 5846
5818 * FIXME, replace dosen't support raid56 yet, please fix
5819 * it in the future.
5820 */
5821 btrfs_bio_counter_dec(root->fs_info); 5847 btrfs_bio_counter_dec(root->fs_info);
5822 return ret; 5848 return ret;
5823 } 5849 }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 08980fa23039..d6fe73c0f4a2 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -292,7 +292,7 @@ struct btrfs_bio_stripe {
292struct btrfs_bio; 292struct btrfs_bio;
293typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); 293typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
294 294
295#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1 295#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0)
296 296
297struct btrfs_bio { 297struct btrfs_bio {
298 atomic_t stripes_pending; 298 atomic_t stripes_pending;
@@ -305,6 +305,8 @@ struct btrfs_bio {
305 int max_errors; 305 int max_errors;
306 int num_stripes; 306 int num_stripes;
307 int mirror_num; 307 int mirror_num;
308 int num_tgtdevs;
309 int *tgtdev_map;
308 struct btrfs_bio_stripe stripes[]; 310 struct btrfs_bio_stripe stripes[];
309}; 311};
310 312
@@ -387,12 +389,18 @@ struct btrfs_balance_control {
387int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 389int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
388 u64 end, u64 *length); 390 u64 end, u64 *length);
389 391
390#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ 392#define btrfs_bio_size(total_stripes, real_stripes) \
391 (sizeof(struct btrfs_bio_stripe) * (n))) 393 (sizeof(struct btrfs_bio) + \
394 (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \
395 (sizeof(int) * (real_stripes)))
392 396
393int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 397int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
394 u64 logical, u64 *length, 398 u64 logical, u64 *length,
395 struct btrfs_bio **bbio_ret, int mirror_num); 399 struct btrfs_bio **bbio_ret, int mirror_num);
400int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
401 u64 logical, u64 *length,
402 struct btrfs_bio **bbio_ret, int mirror_num,
403 u64 **raid_map_ret);
396int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 404int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
397 u64 chunk_start, u64 physical, u64 devid, 405 u64 chunk_start, u64 physical, u64 devid,
398 u64 **logical, int *naddrs, int *stripe_len); 406 u64 **logical, int *naddrs, int *stripe_len);
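
With the new btrfs_bio_size(), a single allocation holds the struct, total_stripes stripe entries and real_stripes ints for tgtdev_map, which is what lets __btrfs_map_block() point tgtdev_map just past stripes[]. A stand-alone sketch of that one-allocation layout, with struct members trimmed to the essentials:

#include <stdio.h>
#include <stdlib.h>

struct stripe { unsigned long long physical; };

struct bio_sketch {
	int num_stripes;
	int num_tgtdevs;
	int *tgtdev_map;		/* points into the same allocation */
	struct stripe stripes[];	/* flexible array, followed by the int map */
};

static struct bio_sketch *alloc_bio_sketch(int total_stripes, int real_stripes)
{
	size_t sz = sizeof(struct bio_sketch) +
		    sizeof(struct stripe) * total_stripes +
		    sizeof(int) * real_stripes;		/* same shape as btrfs_bio_size() */
	struct bio_sketch *b = calloc(1, sz);

	if (!b)
		return NULL;
	b->num_stripes = total_stripes;
	b->num_tgtdevs = real_stripes;
	b->tgtdev_map = (int *)(b->stripes + total_stripes);	/* just past stripes[] */
	return b;
}

int main(void)
{
	struct bio_sketch *b = alloc_bio_sketch(4, 2);

	if (!b)
		return 1;
	b->tgtdev_map[1] = 3;	/* stripe 1 is mirrored to target-device stripe 3 */
	printf("%d\n", b->tgtdev_map[1]);
	free(b);
	return 0;
}
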
@@ -448,8 +456,10 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
448int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 456int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
449int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 457int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
450 struct btrfs_fs_info *fs_info); 458 struct btrfs_fs_info *fs_info);
451void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, 459void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
452 struct btrfs_device *srcdev); 460 struct btrfs_device *srcdev);
461void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
462 struct btrfs_device *srcdev);
453void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 463void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
454 struct btrfs_device *tgtdev); 464 struct btrfs_device *tgtdev);
455void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 465void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
@@ -513,4 +523,16 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
513void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); 523void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
514void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, 524void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
515 struct btrfs_transaction *transaction); 525 struct btrfs_transaction *transaction);
526
527static inline void lock_chunks(struct btrfs_root *root)
528{
529 mutex_lock(&root->fs_info->chunk_mutex);
530}
531
532static inline void unlock_chunks(struct btrfs_root *root)
533{
534 mutex_unlock(&root->fs_info->chunk_mutex);
535}
536
537
516#endif 538#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index dcf20131fbe4..47b19465f0dc 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -29,6 +29,7 @@
29#include "xattr.h" 29#include "xattr.h"
30#include "disk-io.h" 30#include "disk-io.h"
31#include "props.h" 31#include "props.h"
32#include "locking.h"
32 33
33 34
34ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 35ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
@@ -91,7 +92,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
91 struct inode *inode, const char *name, 92 struct inode *inode, const char *name,
92 const void *value, size_t size, int flags) 93 const void *value, size_t size, int flags)
93{ 94{
94 struct btrfs_dir_item *di; 95 struct btrfs_dir_item *di = NULL;
95 struct btrfs_root *root = BTRFS_I(inode)->root; 96 struct btrfs_root *root = BTRFS_I(inode)->root;
96 struct btrfs_path *path; 97 struct btrfs_path *path;
97 size_t name_len = strlen(name); 98 size_t name_len = strlen(name);
@@ -103,84 +104,119 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
103 path = btrfs_alloc_path(); 104 path = btrfs_alloc_path();
104 if (!path) 105 if (!path)
105 return -ENOMEM; 106 return -ENOMEM;
107 path->skip_release_on_error = 1;
108
109 if (!value) {
110 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
111 name, name_len, -1);
112 if (!di && (flags & XATTR_REPLACE))
113 ret = -ENODATA;
114 else if (di)
115 ret = btrfs_delete_one_dir_name(trans, root, path, di);
116 goto out;
117 }
106 118
119 /*
120 * For a replace we can't just do the insert blindly.
121 * Do a lookup first (read-only btrfs_search_slot), and return if xattr
122 * doesn't exist. If it exists, fall down below to the insert/replace
123 * path - we can't race with a concurrent xattr delete, because the VFS
124 * locks the inode's i_mutex before calling setxattr or removexattr.
125 */
107 if (flags & XATTR_REPLACE) { 126 if (flags & XATTR_REPLACE) {
108 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, 127 ASSERT(mutex_is_locked(&inode->i_mutex));
109 name_len, -1); 128 di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
110 if (IS_ERR(di)) { 129 name, name_len, 0);
111 ret = PTR_ERR(di); 130 if (!di) {
112 goto out;
113 } else if (!di) {
114 ret = -ENODATA; 131 ret = -ENODATA;
115 goto out; 132 goto out;
116 } 133 }
117 ret = btrfs_delete_one_dir_name(trans, root, path, di);
118 if (ret)
119 goto out;
120 btrfs_release_path(path); 134 btrfs_release_path(path);
135 di = NULL;
136 }
121 137
138 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
139 name, name_len, value, size);
140 if (ret == -EOVERFLOW) {
122 /* 141 /*
123 * remove the attribute 142 * We have an existing item in a leaf, split_leaf couldn't
143 * expand it. That item may or may not have a dir_item that
144 * matches our target xattr, so let's check.
124 */ 145 */
125 if (!value) 146 ret = 0;
126 goto out; 147 btrfs_assert_tree_locked(path->nodes[0]);
127 } else { 148 di = btrfs_match_dir_item_name(root, path, name, name_len);
128 di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), 149 if (!di && !(flags & XATTR_REPLACE)) {
129 name, name_len, 0); 150 ret = -ENOSPC;
130 if (IS_ERR(di)) {
131 ret = PTR_ERR(di);
132 goto out; 151 goto out;
133 } 152 }
134 if (!di && !value) 153 } else if (ret == -EEXIST) {
135 goto out; 154 ret = 0;
136 btrfs_release_path(path); 155 di = btrfs_match_dir_item_name(root, path, name, name_len);
156 ASSERT(di); /* logic error */
157 } else if (ret) {
158 goto out;
137 } 159 }
138 160
139again: 161 if (di && (flags & XATTR_CREATE)) {
140 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
141 name, name_len, value, size);
142 /*
143 * If we're setting an xattr to a new value but the new value is say
144 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
145 * back from split_leaf. This is because it thinks we'll be extending
146 * the existing item size, but we're asking for enough space to add the
147 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
148 * the rest of the function figure it out.
149 */
150 if (ret == -EOVERFLOW)
151 ret = -EEXIST; 162 ret = -EEXIST;
163 goto out;
164 }
152 165
153 if (ret == -EEXIST) { 166 if (di) {
154 if (flags & XATTR_CREATE)
155 goto out;
156 /* 167 /*
157 * We can't use the path we already have since we won't have the 168 * We're doing a replace, and it must be atomic, that is, at
158 * proper locking for a delete, so release the path and 169 * any point in time we have either the old or the new xattr
159 * re-lookup to delete the thing. 170 * value in the tree. We don't want readers (getxattr and
171 * listxattrs) to miss a value; this is especially important
172 * for ACLs.
160 */ 173 */
161 btrfs_release_path(path); 174 const int slot = path->slots[0];
162 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), 175 struct extent_buffer *leaf = path->nodes[0];
163 name, name_len, -1); 176 const u16 old_data_len = btrfs_dir_data_len(leaf, di);
164 if (IS_ERR(di)) { 177 const u32 item_size = btrfs_item_size_nr(leaf, slot);
165 ret = PTR_ERR(di); 178 const u32 data_size = sizeof(*di) + name_len + size;
166 goto out; 179 struct btrfs_item *item;
167 } else if (!di) { 180 unsigned long data_ptr;
168 /* Shouldn't happen but just in case... */ 181 char *ptr;
169 btrfs_release_path(path); 182
170 goto again; 183 if (size > old_data_len) {
184 if (btrfs_leaf_free_space(root, leaf) <
185 (size - old_data_len)) {
186 ret = -ENOSPC;
187 goto out;
188 }
171 } 189 }
172 190
173 ret = btrfs_delete_one_dir_name(trans, root, path, di); 191 if (old_data_len + name_len + sizeof(*di) == item_size) {
174 if (ret) 192 /* No other xattrs packed in the same leaf item. */
175 goto out; 193 if (size > old_data_len)
194 btrfs_extend_item(root, path,
195 size - old_data_len);
196 else if (size < old_data_len)
197 btrfs_truncate_item(root, path, data_size, 1);
198 } else {
199 /* There are other xattrs packed in the same item. */
200 ret = btrfs_delete_one_dir_name(trans, root, path, di);
201 if (ret)
202 goto out;
203 btrfs_extend_item(root, path, data_size);
204 }
176 205
206 item = btrfs_item_nr(slot);
207 ptr = btrfs_item_ptr(leaf, slot, char);
208 ptr += btrfs_item_size(leaf, item) - data_size;
209 di = (struct btrfs_dir_item *)ptr;
210 btrfs_set_dir_data_len(leaf, di, size);
211 data_ptr = ((unsigned long)(di + 1)) + name_len;
212 write_extent_buffer(leaf, value, data_ptr, size);
213 btrfs_mark_buffer_dirty(leaf);
214 } else {
177 /* 215 /*
178 * We have a value to set, so go back and try to insert it now. 216 * Insert, and we had space for the xattr, so path->slots[0] is
217 * where our xattr dir_item is and btrfs_insert_xattr_item()
218 * filled it.
179 */ 219 */
180 if (value) {
181 btrfs_release_path(path);
182 goto again;
183 }
184 } 220 }
185out: 221out:
186 btrfs_free_path(path); 222 btrfs_free_path(path);