 fs/btrfs/ctree.h       |   2 +
 fs/btrfs/dev-replace.h |  26 ++
 fs/btrfs/reada.c       |  10 +-
 fs/btrfs/scrub.c       | 883 ++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/super.c       |   3 +-
 5 files changed, 851 insertions(+), 73 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746cb6aa1f62..ded7caa0d304 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1483,6 +1483,8 @@ struct btrfs_fs_info {
 	struct rw_semaphore scrub_super_lock;
 	int scrub_workers_refcnt;
 	struct btrfs_workers scrub_workers;
+	struct btrfs_workers scrub_wr_completion_workers;
+	struct btrfs_workers scrub_nocow_workers;
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	u32 check_integrity_print_mask;
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 000000000000..1fb5c89037ee
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) STRATO AG 2012.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_DEV_REPLACE__)
+#define __BTRFS_DEV_REPLACE__
+
+static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
+{
+	atomic64_inc(stat_value);
+}
+#endif
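
[annotation] The new header ships a single inline helper around atomic64_inc(). As a usage sketch (not part of the patch; the wrapper function here is invented for illustration), the scrub.c hunks below call it against the atomic64_t counters kept in fs_info->dev_replace:

	/* hypothetical wrapper, mirroring how scrub.c bumps a counter */
	static void example_count_write_error(struct btrfs_fs_info *fs_info)
	{
		/* num_write_errors is an atomic64_t in fs_info->dev_replace */
		btrfs_dev_replace_stats_inc(
			&fs_info->dev_replace.num_write_errors);
	}
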
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 0ddc5659f946..9f363e17ec74 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -418,12 +418,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 			 */
 			continue;
 		}
+		if (!dev->bdev) {
+			/* cannot read ahead on missing device */
+			continue;
+		}
 		prev_dev = dev;
 		ret = radix_tree_insert(&dev->reada_extents, index, re);
 		if (ret) {
 			while (--i >= 0) {
 				dev = bbio->stripes[i].dev;
 				BUG_ON(dev == NULL);
+				/* ignore whether the entry was inserted */
 				radix_tree_delete(&dev->reada_extents, index);
 			}
 			BUG_ON(fs_info == NULL);
@@ -914,7 +919,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
 	generation = btrfs_header_generation(node);
 	free_extent_buffer(node);
 
-	reada_add_block(rc, start, &max_key, level, generation);
+	if (reada_add_block(rc, start, &max_key, level, generation)) {
+		kfree(rc);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	reada_start_machine(root->fs_info);
 
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 460e30bb1884..61157a26cf2a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -25,6 +25,7 @@
 #include "transaction.h"
 #include "backref.h"
 #include "extent_io.h"
+#include "dev-replace.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
 
@@ -44,8 +45,15 @@
 struct scrub_block;
 struct scrub_ctx;
 
-#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
-#define SCRUB_BIOS_PER_CTX	16	/* 1 MB per device in flight */
+/*
+ * the following three values only influence the performance.
+ * The last one configures the number of parallel and outstanding I/O
+ * operations. The first two values configure an upper limit for the number
+ * of (dynamically allocated) pages that are added to a bio.
+ */
+#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
+#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
+#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
 
 /*
  * the following value times PAGE_SIZE needs to be large enough to match the
@@ -62,6 +70,7 @@ struct scrub_page {
 	u64			generation;
 	u64			logical;
 	u64			physical;
+	u64			physical_for_dev_replace;
 	atomic_t		ref_count;
 	struct {
 		unsigned int	mirror_num:8;
@@ -79,7 +88,11 @@ struct scrub_bio {
 	int			err;
 	u64			logical;
 	u64			physical;
-	struct scrub_page	*pagev[SCRUB_PAGES_PER_BIO];
+#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
+	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
+#else
+	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
+#endif
 	int			page_count;
 	int			next_free;
 	struct btrfs_work	work;
@@ -99,8 +112,16 @@ struct scrub_block {
 	};
 };
 
+struct scrub_wr_ctx {
+	struct scrub_bio *wr_curr_bio;
+	struct btrfs_device *tgtdev;
+	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
+	atomic_t flush_all_writes;
+	struct mutex wr_lock;
+};
+
 struct scrub_ctx {
-	struct scrub_bio	*bios[SCRUB_BIOS_PER_CTX];
+	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
 	struct btrfs_root	*dev_root;
 	int			first_free;
 	int			curr;
@@ -112,12 +133,13 @@ struct scrub_ctx {
 	struct list_head	csum_list;
 	atomic_t		cancel_req;
 	int			readonly;
-	int			pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
+	int			pages_per_rd_bio;
 	u32			sectorsize;
 	u32			nodesize;
 	u32			leafsize;
 
 	int			is_dev_replace;
+	struct scrub_wr_ctx	wr_ctx;
 
 	/*
 	 * statistics
@@ -135,6 +157,15 @@ struct scrub_fixup_nodatasum {
 	int			mirror_num;
 };
 
+struct scrub_copy_nocow_ctx {
+	struct scrub_ctx	*sctx;
+	u64			logical;
+	u64			len;
+	int			mirror_num;
+	u64			physical_for_dev_replace;
+	struct btrfs_work	work;
+};
+
 struct scrub_warning {
 	struct btrfs_path	*path;
 	u64			extent_item_size;
@@ -156,8 +187,9 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 				     struct btrfs_fs_info *fs_info,
+				     struct scrub_block *original_sblock,
 				     u64 length, u64 logical,
-				     struct scrub_block *sblock);
+				     struct scrub_block *sblocks_for_recheck);
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 				struct scrub_block *sblock, int is_metadata,
 				int have_csum, u8 *csum, u64 generation,
@@ -174,6 +206,9 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 					    struct scrub_block *sblock_good,
 					    int page_num, int force_write);
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+					   int page_num);
 static int scrub_checksum_data(struct scrub_block *sblock);
 static int scrub_checksum_tree_block(struct scrub_block *sblock);
 static int scrub_checksum_super(struct scrub_block *sblock);
@@ -181,14 +216,38 @@ static void scrub_block_get(struct scrub_block *sblock);
 static void scrub_block_put(struct scrub_block *sblock);
 static void scrub_page_get(struct scrub_page *spage);
 static void scrub_page_put(struct scrub_page *spage);
-static int scrub_add_page_to_bio(struct scrub_ctx *sctx,
-				 struct scrub_page *spage);
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage);
 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 		       u64 physical, struct btrfs_device *dev, u64 flags,
-		       u64 gen, int mirror_num, u8 *csum, int force);
+		       u64 gen, int mirror_num, u8 *csum, int force,
+		       u64 physical_for_dev_replace);
 static void scrub_bio_end_io(struct bio *bio, int err);
 static void scrub_bio_end_io_worker(struct btrfs_work *work);
 static void scrub_block_complete(struct scrub_block *sblock);
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+			       u64 extent_logical, u64 extent_len,
+			       u64 *extent_physical,
+			       struct btrfs_device **extent_dev,
+			       int *extent_mirror_num);
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+			      struct scrub_wr_ctx *wr_ctx,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_device *dev,
+			      int is_dev_replace);
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage);
+static void scrub_wr_submit(struct scrub_ctx *sctx);
+static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
+static int write_page_nocow(struct scrub_ctx *sctx,
+			    u64 physical_for_dev_replace, struct page *page);
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+				      void *ctx);
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+			    int mirror_num, u64 physical_for_dev_replace);
+static void copy_nocow_pages_worker(struct btrfs_work *work);
 
 
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -262,19 +321,20 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 	if (!sctx)
 		return;
 
+	scrub_free_wr_ctx(&sctx->wr_ctx);
+
 	/* this can happen when scrub is cancelled */
 	if (sctx->curr != -1) {
 		struct scrub_bio *sbio = sctx->bios[sctx->curr];
 
 		for (i = 0; i < sbio->page_count; i++) {
-			BUG_ON(!sbio->pagev[i]);
-			BUG_ON(!sbio->pagev[i]->page);
+			WARN_ON(!sbio->pagev[i]->page);
 			scrub_block_put(sbio->pagev[i]->sblock);
 		}
 		bio_put(sbio->bio);
 	}
 
-	for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) {
+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 		struct scrub_bio *sbio = sctx->bios[i];
 
 		if (!sbio)
@@ -292,18 +352,29 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 	struct scrub_ctx *sctx;
 	int i;
 	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
-	int pages_per_bio;
+	int pages_per_rd_bio;
+	int ret;
 
-	pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
-			      bio_get_nr_vecs(dev->bdev));
+	/*
+	 * the setting of pages_per_rd_bio is correct for scrub but might
+	 * be wrong for the dev_replace code where we might read from
+	 * different devices in the initial huge bios. However, that
+	 * code is able to correctly handle the case when adding a page
+	 * to a bio fails.
+	 */
+	if (dev->bdev)
+		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
+					 bio_get_nr_vecs(dev->bdev));
+	else
+		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
 	if (!sctx)
 		goto nomem;
 	sctx->is_dev_replace = is_dev_replace;
-	sctx->pages_per_bio = pages_per_bio;
+	sctx->pages_per_rd_bio = pages_per_rd_bio;
 	sctx->curr = -1;
 	sctx->dev_root = dev->dev_root;
-	for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) {
+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 		struct scrub_bio *sbio;
 
 		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
@@ -316,7 +387,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 		sbio->page_count = 0;
 		sbio->work.func = scrub_bio_end_io_worker;
 
-		if (i != SCRUB_BIOS_PER_CTX - 1)
+		if (i != SCRUB_BIOS_PER_SCTX - 1)
 			sctx->bios[i]->next_free = i + 1;
 		else
 			sctx->bios[i]->next_free = -1;
@@ -334,6 +405,13 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 	spin_lock_init(&sctx->list_lock);
 	spin_lock_init(&sctx->stat_lock);
 	init_waitqueue_head(&sctx->list_wait);
+
+	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
+				 fs_info->dev_replace.tgtdev, is_dev_replace);
+	if (ret) {
+		scrub_free_ctx(sctx);
+		return ERR_PTR(ret);
+	}
 	return sctx;
 
 nomem:
@@ -341,7 +419,8 @@ nomem:
 	return ERR_PTR(-ENOMEM);
 }
 
-static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
+				     void *warn_ctx)
 {
 	u64 isize;
 	u32 nlink;
@@ -349,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
 	int i;
 	struct extent_buffer *eb;
 	struct btrfs_inode_item *inode_item;
-	struct scrub_warning *swarn = ctx;
+	struct scrub_warning *swarn = warn_ctx;
 	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
 	struct inode_fs_paths *ipath = NULL;
 	struct btrfs_root *local_root;
@@ -492,11 +571,11 @@ out:
 	kfree(swarn.msg_buf);
 }
 
-static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 {
 	struct page *page = NULL;
 	unsigned long index;
-	struct scrub_fixup_nodatasum *fixup = ctx;
+	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
 	int ret;
 	int corrected = 0;
 	struct btrfs_key key;
@@ -660,7 +739,9 @@ out:
 		spin_lock(&sctx->stat_lock);
 		++sctx->stat.uncorrectable_errors;
 		spin_unlock(&sctx->stat_lock);
-
+		btrfs_dev_replace_stats_inc(
+			&sctx->dev_root->fs_info->dev_replace.
+			num_uncorrectable_read_errors);
 		printk_ratelimited_in_rcu(KERN_ERR
 			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
 			(unsigned long long)fixup->logical,
@@ -715,6 +796,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	csum = sblock_to_check->pagev[0]->csum;
 	dev = sblock_to_check->pagev[0]->dev;
 
+	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
+		sblocks_for_recheck = NULL;
+		goto nodatasum_case;
+	}
+
 	/*
 	 * read all mirrors one after the other. This includes to
 	 * re-read the extent or metadata block that failed (that was
@@ -758,7 +844,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	}
 
 	/* setup the context, map the logical blocks and alloc the pages */
-	ret = scrub_setup_recheck_block(sctx, fs_info, length,
+	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
 					logical, sblocks_for_recheck);
 	if (ret) {
 		spin_lock(&sctx->stat_lock);
@@ -789,6 +875,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		sctx->stat.unverified_errors++;
 		spin_unlock(&sctx->stat_lock);
 
+		if (sctx->is_dev_replace)
+			scrub_write_block_to_dev_replace(sblock_bad);
 		goto out;
 	}
 
@@ -822,12 +910,15 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	}
 
-	if (sctx->readonly)
+	if (sctx->readonly && !sctx->is_dev_replace)
 		goto did_not_correct_error;
 
 	if (!is_metadata && !have_csum) {
 		struct scrub_fixup_nodatasum *fixup_nodatasum;
 
+nodatasum_case:
+		WARN_ON(sctx->is_dev_replace);
+
 		/*
 		 * !is_metadata and !have_csum, this means that the data
 		 * might not be COW'ed, that it might be modified
@@ -883,18 +974,79 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		if (!sblock_other->header_error &&
 		    !sblock_other->checksum_error &&
 		    sblock_other->no_io_error_seen) {
-			int force_write = is_metadata || have_csum;
-
-			ret = scrub_repair_block_from_good_copy(sblock_bad,
-								sblock_other,
-								force_write);
+			if (sctx->is_dev_replace) {
+				scrub_write_block_to_dev_replace(sblock_other);
+			} else {
+				int force_write = is_metadata || have_csum;
+
+				ret = scrub_repair_block_from_good_copy(
+						sblock_bad, sblock_other,
+						force_write);
+			}
 			if (0 == ret)
 				goto corrected_error;
 		}
 	}
 
 	/*
-	 * in case of I/O errors in the area that is supposed to be
+	 * for dev_replace, pick good pages and write to the target device.
+	 */
+	if (sctx->is_dev_replace) {
+		success = 1;
+		for (page_num = 0; page_num < sblock_bad->page_count;
+		     page_num++) {
+			int sub_success;
+
+			sub_success = 0;
+			for (mirror_index = 0;
+			     mirror_index < BTRFS_MAX_MIRRORS &&
+			     sblocks_for_recheck[mirror_index].page_count > 0;
+			     mirror_index++) {
+				struct scrub_block *sblock_other =
+					sblocks_for_recheck + mirror_index;
+				struct scrub_page *page_other =
+					sblock_other->pagev[page_num];
+
+				if (!page_other->io_error) {
+					ret = scrub_write_page_to_dev_replace(
+							sblock_other, page_num);
+					if (ret == 0) {
+						/* succeeded for this page */
+						sub_success = 1;
+						break;
+					} else {
+						btrfs_dev_replace_stats_inc(
+							&sctx->dev_root->
+							fs_info->dev_replace.
+							num_write_errors);
+					}
+				}
+			}
+
+			if (!sub_success) {
+				/*
+				 * did not find a mirror to fetch the page
+				 * from. scrub_write_page_to_dev_replace()
+				 * handles this case (page->io_error), by
+				 * filling the block with zeros before
+				 * submitting the write request
+				 */
+				success = 0;
+				ret = scrub_write_page_to_dev_replace(
+						sblock_bad, page_num);
+				if (ret)
+					btrfs_dev_replace_stats_inc(
+						&sctx->dev_root->fs_info->
+						dev_replace.num_write_errors);
+			}
+		}
+
+		goto out;
+	}
+
+	/*
+	 * for regular scrub, repair those pages that are errored.
+	 * In case of I/O errors in the area that is supposed to be
 	 * repaired, continue by picking good copies of those pages.
 	 * Select the good pages from mirrors to rewrite bad pages from
 	 * the area to fix. Afterwards verify the checksum of the block
@@ -1017,6 +1169,7 @@ out:
 
 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 				     struct btrfs_fs_info *fs_info,
+				     struct scrub_block *original_sblock,
 				     u64 length, u64 logical,
 				     struct scrub_block *sblocks_for_recheck)
 {
@@ -1047,7 +1200,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 			return -EIO;
 		}
 
-		BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
+		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
 		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
 		     mirror_index++) {
 			struct scrub_block *sblock;
@@ -1071,6 +1224,10 @@ leave_nomem:
 			sblock->pagev[page_index] = page;
 			page->logical = logical;
 			page->physical = bbio->stripes[mirror_index].physical;
+			BUG_ON(page_index >= original_sblock->page_count);
+			page->physical_for_dev_replace =
+				original_sblock->pagev[page_index]->
+				physical_for_dev_replace;
 			/* for missing devices, dev->bdev is NULL */
 			page->dev = bbio->stripes[mirror_index].dev;
 			page->mirror_num = mirror_index + 1;
@@ -1249,6 +1406,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 	int ret;
 	DECLARE_COMPLETION_ONSTACK(complete);
 
+	if (!page_bad->dev->bdev) {
+		printk_ratelimited(KERN_WARNING
+			"btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
+		return -EIO;
+	}
+
 	bio = bio_alloc(GFP_NOFS, 1);
 	if (!bio)
 		return -EIO;
@@ -1269,6 +1432,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 		if (!bio_flagged(bio, BIO_UPTODATE)) {
 			btrfs_dev_stat_inc_and_print(page_bad->dev,
 				BTRFS_DEV_STAT_WRITE_ERRS);
+			btrfs_dev_replace_stats_inc(
+				&sblock_bad->sctx->dev_root->fs_info->
+				dev_replace.num_write_errors);
 			bio_put(bio);
 			return -EIO;
 		}
@@ -1278,7 +1444,168 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 	return 0;
 }
 
-static void scrub_checksum(struct scrub_block *sblock)
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
+{
+	int page_num;
+
+	for (page_num = 0; page_num < sblock->page_count; page_num++) {
+		int ret;
+
+		ret = scrub_write_page_to_dev_replace(sblock, page_num);
+		if (ret)
+			btrfs_dev_replace_stats_inc(
+				&sblock->sctx->dev_root->fs_info->dev_replace.
+				num_write_errors);
+	}
+}
+
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+					   int page_num)
+{
+	struct scrub_page *spage = sblock->pagev[page_num];
+
+	BUG_ON(spage->page == NULL);
+	if (spage->io_error) {
+		void *mapped_buffer = kmap_atomic(spage->page);
+
+		memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
+		flush_dcache_page(spage->page);
+		kunmap_atomic(mapped_buffer);
+	}
+	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
+}
+
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage)
+{
+	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+	struct scrub_bio *sbio;
+	int ret;
+
+	mutex_lock(&wr_ctx->wr_lock);
+again:
+	if (!wr_ctx->wr_curr_bio) {
+		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
+					      GFP_NOFS);
+		if (!wr_ctx->wr_curr_bio) {
+			mutex_unlock(&wr_ctx->wr_lock);
+			return -ENOMEM;
+		}
+		wr_ctx->wr_curr_bio->sctx = sctx;
+		wr_ctx->wr_curr_bio->page_count = 0;
+	}
+	sbio = wr_ctx->wr_curr_bio;
+	if (sbio->page_count == 0) {
+		struct bio *bio;
+
+		sbio->physical = spage->physical_for_dev_replace;
+		sbio->logical = spage->logical;
+		sbio->dev = wr_ctx->tgtdev;
+		bio = sbio->bio;
+		if (!bio) {
+			bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
+			if (!bio) {
+				mutex_unlock(&wr_ctx->wr_lock);
+				return -ENOMEM;
+			}
+			sbio->bio = bio;
+		}
+
+		bio->bi_private = sbio;
+		bio->bi_end_io = scrub_wr_bio_end_io;
+		bio->bi_bdev = sbio->dev->bdev;
+		bio->bi_sector = sbio->physical >> 9;
+		sbio->err = 0;
+	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+		   spage->physical_for_dev_replace ||
+		   sbio->logical + sbio->page_count * PAGE_SIZE !=
+		   spage->logical) {
+		scrub_wr_submit(sctx);
+		goto again;
+	}
+
+	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+	if (ret != PAGE_SIZE) {
+		if (sbio->page_count < 1) {
+			bio_put(sbio->bio);
+			sbio->bio = NULL;
+			mutex_unlock(&wr_ctx->wr_lock);
+			return -EIO;
+		}
+		scrub_wr_submit(sctx);
+		goto again;
+	}
+
+	sbio->pagev[sbio->page_count] = spage;
+	scrub_page_get(spage);
+	sbio->page_count++;
+	if (sbio->page_count == wr_ctx->pages_per_wr_bio)
+		scrub_wr_submit(sctx);
+	mutex_unlock(&wr_ctx->wr_lock);
+
+	return 0;
+}
+
+static void scrub_wr_submit(struct scrub_ctx *sctx)
+{
+	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+	struct scrub_bio *sbio;
+
+	if (!wr_ctx->wr_curr_bio)
+		return;
+
+	sbio = wr_ctx->wr_curr_bio;
+	wr_ctx->wr_curr_bio = NULL;
+	WARN_ON(!sbio->bio->bi_bdev);
+	scrub_pending_bio_inc(sctx);
+	/* process all writes in a single worker thread. Then the block layer
+	 * orders the requests before sending them to the driver which
+	 * doubled the write performance on spinning disks when measured
+	 * with Linux 3.5 */
+	btrfsic_submit_bio(WRITE, sbio->bio);
+}
+
+static void scrub_wr_bio_end_io(struct bio *bio, int err)
+{
+	struct scrub_bio *sbio = bio->bi_private;
+	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
+
+	sbio->err = err;
+	sbio->bio = bio;
+
+	sbio->work.func = scrub_wr_bio_end_io_worker;
+	btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
+}
+
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
+{
+	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+	struct scrub_ctx *sctx = sbio->sctx;
+	int i;
+
+	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
+	if (sbio->err) {
+		struct btrfs_dev_replace *dev_replace =
+			&sbio->sctx->dev_root->fs_info->dev_replace;
+
+		for (i = 0; i < sbio->page_count; i++) {
+			struct scrub_page *spage = sbio->pagev[i];
+
+			spage->io_error = 1;
+			btrfs_dev_replace_stats_inc(&dev_replace->
+						    num_write_errors);
+		}
+	}
+
+	for (i = 0; i < sbio->page_count; i++)
+		scrub_page_put(sbio->pagev[i]);
+
+	bio_put(sbio->bio);
+	kfree(sbio);
+	scrub_pending_bio_dec(sctx);
+}
+
+static int scrub_checksum(struct scrub_block *sblock)
 {
 	u64 flags;
 	int ret;
@@ -1296,6 +1623,8 @@ static void scrub_checksum(struct scrub_block *sblock)
 		WARN_ON(1);
 	if (ret)
 		scrub_handle_errored_block(sblock);
+
+	return ret;
 }
 
 static int scrub_checksum_data(struct scrub_block *sblock)
@@ -1386,7 +1715,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 		   BTRFS_UUID_SIZE))
 		++fail;
 
-	BUG_ON(sctx->nodesize != sctx->leafsize);
+	WARN_ON(sctx->nodesize != sctx->leafsize);
 	len = sctx->nodesize - BTRFS_CSUM_SIZE;
 	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
 	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
@@ -1534,11 +1863,24 @@ static void scrub_submit(struct scrub_ctx *sctx)
 	sctx->curr = -1;
 	scrub_pending_bio_inc(sctx);
 
-	btrfsic_submit_bio(READ, sbio->bio);
+	if (!sbio->bio->bi_bdev) {
+		/*
+		 * this case should not happen. If btrfs_map_block() is
+		 * wrong, it could happen for dev-replace operations on
+		 * missing devices when no mirrors are available, but in
+		 * this case it should already fail the mount.
+		 * This case is handled correctly (but _very_ slowly).
+		 */
+		printk_ratelimited(KERN_WARNING
+			"btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
+		bio_endio(sbio->bio, -EIO);
+	} else {
+		btrfsic_submit_bio(READ, sbio->bio);
+	}
 }
 
-static int scrub_add_page_to_bio(struct scrub_ctx *sctx,
-				 struct scrub_page *spage)
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage)
 {
 	struct scrub_block *sblock = spage->sblock;
 	struct scrub_bio *sbio;
@@ -1570,7 +1912,7 @@ again:
 		sbio->dev = spage->dev;
 		bio = sbio->bio;
 		if (!bio) {
-			bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio);
+			bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
 			if (!bio)
 				return -ENOMEM;
 			sbio->bio = bio;
@@ -1602,10 +1944,10 @@ again:
 		goto again;
 	}
 
-	scrub_block_get(sblock); /* one for the added page */
+	scrub_block_get(sblock); /* one for the page added to the bio */
 	atomic_inc(&sblock->outstanding_pages);
 	sbio->page_count++;
-	if (sbio->page_count == sctx->pages_per_bio)
+	if (sbio->page_count == sctx->pages_per_rd_bio)
 		scrub_submit(sctx);
 
 	return 0;
@@ -1613,7 +1955,8 @@ again:
 
 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 		       u64 physical, struct btrfs_device *dev, u64 flags,
-		       u64 gen, int mirror_num, u8 *csum, int force)
+		       u64 gen, int mirror_num, u8 *csum, int force,
+		       u64 physical_for_dev_replace)
 {
 	struct scrub_block *sblock;
 	int index;
@@ -1654,6 +1997,7 @@ leave_nomem:
 		spage->generation = gen;
 		spage->logical = logical;
 		spage->physical = physical;
+		spage->physical_for_dev_replace = physical_for_dev_replace;
 		spage->mirror_num = mirror_num;
 		if (csum) {
 			spage->have_csum = 1;
@@ -1668,6 +2012,7 @@ leave_nomem:
 		len -= l;
 		logical += l;
 		physical += l;
+		physical_for_dev_replace += l;
 	}
 
 	WARN_ON(sblock->page_count == 0);
@@ -1675,7 +2020,7 @@ leave_nomem:
 		struct scrub_page *spage = sblock->pagev[index];
 		int ret;
 
-		ret = scrub_add_page_to_bio(sctx, spage);
+		ret = scrub_add_page_to_rd_bio(sctx, spage);
 		if (ret) {
 			scrub_block_put(sblock);
 			return ret;
@@ -1707,7 +2052,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
 	struct scrub_ctx *sctx = sbio->sctx;
 	int i;
 
-	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
+	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
 	if (sbio->err) {
 		for (i = 0; i < sbio->page_count; i++) {
 			struct scrub_page *spage = sbio->pagev[i];
@@ -1733,15 +2078,30 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
 	sbio->next_free = sctx->first_free;
 	sctx->first_free = sbio->index;
 	spin_unlock(&sctx->list_lock);
+
+	if (sctx->is_dev_replace &&
+	    atomic_read(&sctx->wr_ctx.flush_all_writes)) {
+		mutex_lock(&sctx->wr_ctx.wr_lock);
+		scrub_wr_submit(sctx);
+		mutex_unlock(&sctx->wr_ctx.wr_lock);
+	}
+
 	scrub_pending_bio_dec(sctx);
 }
 
 static void scrub_block_complete(struct scrub_block *sblock)
 {
-	if (!sblock->no_io_error_seen)
+	if (!sblock->no_io_error_seen) {
 		scrub_handle_errored_block(sblock);
-	else
-		scrub_checksum(sblock);
+	} else {
+		/*
+		 * if has checksum error, write via repair mechanism in
+		 * dev replace case, otherwise write here in dev replace
+		 * case.
+		 */
+		if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
+			scrub_write_block_to_dev_replace(sblock);
+	}
 }
 
 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -1786,7 +2146,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
 /* scrub extent tries to collect up to 64 kB for each bio */
 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
 			u64 physical, struct btrfs_device *dev, u64 flags,
-			u64 gen, int mirror_num)
+			u64 gen, int mirror_num, u64 physical_for_dev_replace)
 {
 	int ret;
 	u8 csum[BTRFS_CSUM_SIZE];
@@ -1799,7 +2159,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
 		sctx->stat.data_bytes_scrubbed += len;
 		spin_unlock(&sctx->stat_lock);
 	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-		BUG_ON(sctx->nodesize != sctx->leafsize);
+		WARN_ON(sctx->nodesize != sctx->leafsize);
 		blocksize = sctx->nodesize;
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.tree_extents_scrubbed++;
@@ -1807,7 +2167,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
 		spin_unlock(&sctx->stat_lock);
 	} else {
 		blocksize = sctx->sectorsize;
-		BUG_ON(1);
+		WARN_ON(1);
 	}
 
 	while (len) {
@@ -1819,14 +2179,23 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
 			have_csum = scrub_find_csum(sctx, logical, l, csum);
 			if (have_csum == 0)
 				++sctx->stat.no_csum;
+			if (sctx->is_dev_replace && !have_csum) {
+				ret = copy_nocow_pages(sctx, logical, l,
+						       mirror_num,
+						       physical_for_dev_replace);
+				goto behind_scrub_pages;
+			}
 		}
 		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
-				  mirror_num, have_csum ? csum : NULL, 0);
+				  mirror_num, have_csum ? csum : NULL, 0,
+				  physical_for_dev_replace);
+behind_scrub_pages:
 		if (ret)
 			return ret;
 		len -= l;
 		logical += l;
 		physical += l;
+		physical_for_dev_replace += l;
 	}
 	return 0;
 }
@@ -1834,7 +2203,8 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 					   struct map_lookup *map,
 					   struct btrfs_device *scrub_dev,
-					   int num, u64 base, u64 length)
+					   int num, u64 base, u64 length,
+					   int is_dev_replace)
 {
 	struct btrfs_path *path;
 	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
@@ -1859,6 +2229,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	struct btrfs_key key_end;
 	u64 increment = map->stripe_len;
 	u64 offset;
+	u64 extent_logical;
+	u64 extent_physical;
+	u64 extent_len;
+	struct btrfs_device *extent_dev;
+	int extent_mirror_num;
 
 	nstripes = length;
 	offset = 0;
@@ -1966,9 +2341,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 		 */
 		if (atomic_read(&fs_info->scrub_pause_req)) {
 			/* push queued extents */
+			atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
 			scrub_submit(sctx);
+			mutex_lock(&sctx->wr_ctx.wr_lock);
+			scrub_wr_submit(sctx);
+			mutex_unlock(&sctx->wr_ctx.wr_lock);
 			wait_event(sctx->list_wait,
 				   atomic_read(&sctx->bios_in_flight) == 0);
+			atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
 			atomic_inc(&fs_info->scrubs_paused);
 			wake_up(&fs_info->scrub_pause_wait);
 			mutex_lock(&fs_info->scrub_lock);
@@ -2063,10 +2443,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 				key.objectid;
 		}
 
-		ret = scrub_extent(sctx, key.objectid, key.offset,
-				   key.objectid - logical + physical,
-				   scrub_dev, flags, generation,
-				   mirror_num);
+		extent_logical = key.objectid;
+		extent_physical = key.objectid - logical + physical;
+		extent_len = key.offset;
+		extent_dev = scrub_dev;
+		extent_mirror_num = mirror_num;
+		if (is_dev_replace)
+			scrub_remap_extent(fs_info, extent_logical,
+					   extent_len, &extent_physical,
+					   &extent_dev,
+					   &extent_mirror_num);
+		ret = scrub_extent(sctx, extent_logical, extent_len,
+				   extent_physical, extent_dev, flags,
+				   generation, extent_mirror_num,
+				   key.objectid - logical + physical);
 		if (ret)
 			goto out;
 
@@ -2080,10 +2470,13 @@ next:
 			sctx->stat.last_physical = physical;
 			spin_unlock(&sctx->stat_lock);
 		}
+out:
 	/* push queued extents */
 	scrub_submit(sctx);
+	mutex_lock(&sctx->wr_ctx.wr_lock);
+	scrub_wr_submit(sctx);
+	mutex_unlock(&sctx->wr_ctx.wr_lock);
 
-out:
 	blk_finish_plug(&plug);
 	btrfs_free_path(path);
 	return ret < 0 ? ret : 0;
@@ -2093,14 +2486,14 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 					  struct btrfs_device *scrub_dev,
 					  u64 chunk_tree, u64 chunk_objectid,
 					  u64 chunk_offset, u64 length,
-					  u64 dev_offset)
+					  u64 dev_offset, int is_dev_replace)
 {
 	struct btrfs_mapping_tree *map_tree =
 		&sctx->dev_root->fs_info->mapping_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
 	int i;
-	int ret = -EINVAL;
+	int ret = 0;
 
 	read_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
@@ -2120,7 +2513,8 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
 		    map->stripes[i].physical == dev_offset) {
 			ret = scrub_stripe(sctx, map, scrub_dev, i,
-					   chunk_offset, length);
+					   chunk_offset, length,
+					   is_dev_replace);
 			if (ret)
 				goto out;
 		}
@@ -2133,7 +2527,8 @@ out:
 
 static noinline_for_stack
 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
-			   struct btrfs_device *scrub_dev, u64 start, u64 end)
+			   struct btrfs_device *scrub_dev, u64 start, u64 end,
+			   int is_dev_replace)
 {
 	struct btrfs_dev_extent *dev_extent = NULL;
 	struct btrfs_path *path;
@@ -2149,6 +2544,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct btrfs_block_group_cache *cache;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -2214,11 +2610,61 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 			ret = -ENOENT;
 			break;
 		}
+		dev_replace->cursor_right = found_key.offset + length;
+		dev_replace->cursor_left = found_key.offset;
+		dev_replace->item_needs_writeback = 1;
 		ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
-				  chunk_offset, length, found_key.offset);
+				  chunk_offset, length, found_key.offset,
+				  is_dev_replace);
+
+		/*
+		 * flush, submit all pending read and write bios, afterwards
+		 * wait for them.
+		 * Note that in the dev replace case, a read request causes
+		 * write requests that are submitted in the read completion
+		 * worker. Therefore in the current situation, it is required
+		 * that all write requests are flushed, so that all read and
+		 * write requests are really completed when bios_in_flight
+		 * changes to 0.
+		 */
+		atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+		scrub_submit(sctx);
+		mutex_lock(&sctx->wr_ctx.wr_lock);
+		scrub_wr_submit(sctx);
+		mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+		wait_event(sctx->list_wait,
+			   atomic_read(&sctx->bios_in_flight) == 0);
+		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+		atomic_inc(&fs_info->scrubs_paused);
+		wake_up(&fs_info->scrub_pause_wait);
+		wait_event(sctx->list_wait,
+			   atomic_read(&sctx->workers_pending) == 0);
+
+		mutex_lock(&fs_info->scrub_lock);
+		while (atomic_read(&fs_info->scrub_pause_req)) {
+			mutex_unlock(&fs_info->scrub_lock);
+			wait_event(fs_info->scrub_pause_wait,
+				   atomic_read(&fs_info->scrub_pause_req) == 0);
+			mutex_lock(&fs_info->scrub_lock);
+		}
+		atomic_dec(&fs_info->scrubs_paused);
+		mutex_unlock(&fs_info->scrub_lock);
+		wake_up(&fs_info->scrub_pause_wait);
+
+		dev_replace->cursor_left = dev_replace->cursor_right;
+		dev_replace->item_needs_writeback = 1;
 		btrfs_put_block_group(cache);
 		if (ret)
 			break;
+		if (atomic64_read(&dev_replace->num_write_errors) > 0) {
+			ret = -EIO;
+			break;
+		}
+		if (sctx->stat.malloc_errors > 0) {
+			ret = -ENOMEM;
+			break;
+		}
 
 		key.offset = found_key.offset + length;
 		btrfs_release_path(path);
@@ -2254,7 +2700,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
 
 		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
 				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
-				  NULL, 1);
+				  NULL, 1, bytenr);
 		if (ret)
 			return ret;
 	}
@@ -2266,18 +2712,38 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
 /*
  * get a reference count on fs_info->scrub_workers. start worker if necessary
  */
-static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
+static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
+						int is_dev_replace)
 {
 	int ret = 0;
 
 	mutex_lock(&fs_info->scrub_lock);
 	if (fs_info->scrub_workers_refcnt == 0) {
-		btrfs_init_workers(&fs_info->scrub_workers, "scrub",
-			   fs_info->thread_pool_size, &fs_info->generic_worker);
+		if (is_dev_replace)
+			btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
+					   &fs_info->generic_worker);
+		else
+			btrfs_init_workers(&fs_info->scrub_workers, "scrub",
+					   fs_info->thread_pool_size,
+					   &fs_info->generic_worker);
 		fs_info->scrub_workers.idle_thresh = 4;
 		ret = btrfs_start_workers(&fs_info->scrub_workers);
 		if (ret)
 			goto out;
+		btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
+				   "scrubwrc",
+				   fs_info->thread_pool_size,
+				   &fs_info->generic_worker);
+		fs_info->scrub_wr_completion_workers.idle_thresh = 2;
+		ret = btrfs_start_workers(
+				&fs_info->scrub_wr_completion_workers);
+		if (ret)
+			goto out;
+		btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
+				   &fs_info->generic_worker);
+		ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
+		if (ret)
+			goto out;
 	}
 	++fs_info->scrub_workers_refcnt;
 out:
@@ -2289,8 +2755,11 @@ out:
 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
 {
 	mutex_lock(&fs_info->scrub_lock);
-	if (--fs_info->scrub_workers_refcnt == 0)
+	if (--fs_info->scrub_workers_refcnt == 0) {
 		btrfs_stop_workers(&fs_info->scrub_workers);
+		btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
+		btrfs_stop_workers(&fs_info->scrub_nocow_workers);
+	}
 	WARN_ON(fs_info->scrub_workers_refcnt < 0);
 	mutex_unlock(&fs_info->scrub_lock);
 }
@@ -2354,7 +2823,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		return -EINVAL;
 	}
 
-	ret = scrub_workers_get(fs_info);
+	ret = scrub_workers_get(fs_info, is_dev_replace);
 	if (ret)
 		return ret;
 
@@ -2394,12 +2863,15 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	mutex_unlock(&fs_info->scrub_lock);
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
-	down_read(&fs_info->scrub_super_lock);
-	ret = scrub_supers(sctx, dev);
-	up_read(&fs_info->scrub_super_lock);
+	if (!is_dev_replace) {
+		down_read(&fs_info->scrub_super_lock);
+		ret = scrub_supers(sctx, dev);
+		up_read(&fs_info->scrub_super_lock);
+	}
 
 	if (!ret)
-		ret = scrub_enumerate_chunks(sctx, dev, start, end);
+		ret = scrub_enumerate_chunks(sctx, dev, start, end,
+					     is_dev_replace);
 
 	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
 	atomic_dec(&fs_info->scrubs_running);
@@ -2537,3 +3009,272 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 
 	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
 }
+
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+			       u64 extent_logical, u64 extent_len,
+			       u64 *extent_physical,
+			       struct btrfs_device **extent_dev,
+			       int *extent_mirror_num)
+{
+	u64 mapped_length;
+	struct btrfs_bio *bbio = NULL;
+	int ret;
+
+	mapped_length = extent_len;
+	ret = btrfs_map_block(fs_info, READ, extent_logical,
+			      &mapped_length, &bbio, 0);
+	if (ret || !bbio || mapped_length < extent_len ||
+	    !bbio->stripes[0].dev->bdev) {
+		kfree(bbio);
+		return;
+	}
+
+	*extent_physical = bbio->stripes[0].physical;
+	*extent_mirror_num = bbio->mirror_num;
+	*extent_dev = bbio->stripes[0].dev;
+	kfree(bbio);
+}
+
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+			      struct scrub_wr_ctx *wr_ctx,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_device *dev,
+			      int is_dev_replace)
+{
+	WARN_ON(wr_ctx->wr_curr_bio != NULL);
+
+	mutex_init(&wr_ctx->wr_lock);
+	wr_ctx->wr_curr_bio = NULL;
+	if (!is_dev_replace)
+		return 0;
+
+	WARN_ON(!dev->bdev);
+	wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
+					 bio_get_nr_vecs(dev->bdev));
+	wr_ctx->tgtdev = dev;
+	atomic_set(&wr_ctx->flush_all_writes, 0);
+	return 0;
+}
+
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
+{
+	mutex_lock(&wr_ctx->wr_lock);
+	kfree(wr_ctx->wr_curr_bio);
+	wr_ctx->wr_curr_bio = NULL;
+	mutex_unlock(&wr_ctx->wr_lock);
+}
+
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+			    int mirror_num, u64 physical_for_dev_replace)
+{
+	struct scrub_copy_nocow_ctx *nocow_ctx;
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+	nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
+	if (!nocow_ctx) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		return -ENOMEM;
+	}
+
+	scrub_pending_trans_workers_inc(sctx);
+
+	nocow_ctx->sctx = sctx;
+	nocow_ctx->logical = logical;
+	nocow_ctx->len = len;
+	nocow_ctx->mirror_num = mirror_num;
+	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
+	nocow_ctx->work.func = copy_nocow_pages_worker;
+	btrfs_queue_worker(&fs_info->scrub_nocow_workers,
+			   &nocow_ctx->work);
+
+	return 0;
+}
+
+static void copy_nocow_pages_worker(struct btrfs_work *work)
+{
+	struct scrub_copy_nocow_ctx *nocow_ctx =
+		container_of(work, struct scrub_copy_nocow_ctx, work);
+	struct scrub_ctx *sctx = nocow_ctx->sctx;
+	u64 logical = nocow_ctx->logical;
+	u64 len = nocow_ctx->len;
+	int mirror_num = nocow_ctx->mirror_num;
+	u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
+	int ret;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_path *path;
+	struct btrfs_root *root;
+	int not_written = 0;
+
+	fs_info = sctx->dev_root->fs_info;
+	root = fs_info->extent_root;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		not_written = 1;
+		goto out;
+	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		not_written = 1;
+		goto out;
+	}
+
+	ret = iterate_inodes_from_logical(logical, fs_info, path,
+					  copy_nocow_pages_for_inode,
+					  nocow_ctx);
+	if (ret != 0 && ret != -ENOENT) {
+		pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
+			(unsigned long long)logical,
+			(unsigned long long)physical_for_dev_replace,
+			(unsigned long long)len,
+			(unsigned long long)mirror_num, ret);
+		not_written = 1;
+		goto out;
+	}
+
+out:
+	if (trans && !IS_ERR(trans))
+		btrfs_end_transaction(trans, root);
+	if (not_written)
+		btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
+					    num_uncorrectable_read_errors);
+
+	btrfs_free_path(path);
+	kfree(nocow_ctx);
+
+	scrub_pending_trans_workers_dec(sctx);
+}
+
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
+{
+	unsigned long index;
+	struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
+	int ret = 0;
+	struct btrfs_key key;
+	struct inode *inode = NULL;
+	struct btrfs_root *local_root;
+	u64 physical_for_dev_replace;
+	u64 len;
+	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+
+	key.objectid = root;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(local_root))
+		return PTR_ERR(local_root);
+
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.objectid = inum;
+	key.offset = 0;
+	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
+	len = nocow_ctx->len;
+	while (len >= PAGE_CACHE_SIZE) {
+		struct page *page = NULL;
+		int ret_sub;
+
+		index = offset >> PAGE_CACHE_SHIFT;
+
+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+		if (!page) {
+			pr_err("find_or_create_page() failed\n");
+			ret = -ENOMEM;
+			goto next_page;
+		}
+
+		if (PageUptodate(page)) {
+			if (PageDirty(page))
+				goto next_page;
+		} else {
+			ClearPageError(page);
+			ret_sub = extent_read_full_page(&BTRFS_I(inode)->
+							io_tree,
+							page, btrfs_get_extent,
+							nocow_ctx->mirror_num);
+			if (ret_sub) {
+				ret = ret_sub;
+				goto next_page;
+			}
+			wait_on_page_locked(page);
+			if (!PageUptodate(page)) {
+				ret = -EIO;
+				goto next_page;
+			}
+		}
+		ret_sub = write_page_nocow(nocow_ctx->sctx,
+					   physical_for_dev_replace, page);
+		if (ret_sub) {
+			ret = ret_sub;
+			goto next_page;
+		}
+
+next_page:
+		if (page) {
+			unlock_page(page);
+			put_page(page);
+		}
+		offset += PAGE_CACHE_SIZE;
+		physical_for_dev_replace += PAGE_CACHE_SIZE;
+		len -= PAGE_CACHE_SIZE;
+	}
+
+	if (inode)
+		iput(inode);
+	return ret;
+}
+
+static int write_page_nocow(struct scrub_ctx *sctx,
+			    u64 physical_for_dev_replace, struct page *page)
+{
+	struct bio *bio;
+	struct btrfs_device *dev;
+	int ret;
+	DECLARE_COMPLETION_ONSTACK(compl);
+
+	dev = sctx->wr_ctx.tgtdev;
+	if (!dev)
+		return -EIO;
+	if (!dev->bdev) {
+		printk_ratelimited(KERN_WARNING
+			"btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+		return -EIO;
+	}
+	bio = bio_alloc(GFP_NOFS, 1);
+	if (!bio) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		return -ENOMEM;
+	}
+	bio->bi_private = &compl;
+	bio->bi_end_io = scrub_complete_bio_end_io;
+	bio->bi_size = 0;
+	bio->bi_sector = physical_for_dev_replace >> 9;
+	bio->bi_bdev = dev->bdev;
+	ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+	if (ret != PAGE_CACHE_SIZE) {
+leave_with_eio:
+		bio_put(bio);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+		return -EIO;
+	}
+	btrfsic_submit_bio(WRITE_SYNC, bio);
+	wait_for_completion(&compl);
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		goto leave_with_eio;
+
+	bio_put(bio);
+	return 0;
+}
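
[annotation] A condensed sketch of the flush protocol the scrub.c hunks above repeat in scrub_stripe() and scrub_enumerate_chunks() (the wrapper function name is invented; everything it calls is from the patch). Read completions can queue further writes through the write-bio path, so both the pending read bio and the pending write bio are pushed; flush_all_writes makes the read-completion worker submit partially filled write bios immediately instead of letting them fill up, and only then does bios_in_flight reliably drain to 0:

	static void example_flush_all_io(struct scrub_ctx *sctx)
	{
		atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
		scrub_submit(sctx);		/* push the queued read bio */
		mutex_lock(&sctx->wr_ctx.wr_lock);
		scrub_wr_submit(sctx);		/* push the queued write bio */
		mutex_unlock(&sctx->wr_ctx.wr_lock);
		wait_event(sctx->list_wait,
			   atomic_read(&sctx->bios_in_flight) == 0);
		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
	}
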
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 837ad2d27853..ad4380684b9b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1195,7 +1195,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
 	btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
 	btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
 	btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
-	btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
+			      new_pool_size);
 }
 
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)