author		Stefan Behrens <sbehrens@giantdisaster.de>	2012-11-06 05:43:11 -0500
committer	Josef Bacik <jbacik@fusionio.com>	2012-12-12 17:15:41 -0500
commit		ff023aac31198e88507d626825379b28ea481d4d (patch)
tree		7798bcbc762fdd3c91b013b6bb8264e3087a215d /fs/btrfs
parent		618919236ba54361e93106f4951d233a7ade63cd (diff)
Btrfs: add code to scrub to copy read data to another disk
The device replace procedure makes use of the scrub code. The scrub code is
the most efficient code to read the allocated data of a disk, i.e. it reads
sequentially in order to avoid disk head movements, it skips unallocated
blocks, it uses read ahead mechanisms, and it contains all the code to
detect and repair defects.

This commit adds code to scrub to allow the scrub code to copy read data
to another disk.

One goal is to be able to perform as fast as possible. Therefore the write
requests are collected until huge bios are built, and the write process is
decoupled from the read process with some kind of flow control, of course,
in order to limit the allocated memory.

The best performance on spinning disks can be reached when the disk head
movements are avoided as much as possible. Therefore a single worker is
used to interface the read process with the write process.

The regular scrub operation works as fast as before; it is not negatively
influenced and is more or less unchanged.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
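For readers unfamiliar with the pattern described above, here is a minimal
stand-alone sketch of the same scheme -- writes collected into large batches,
a bounded number of batches in flight (flow control), and a single writer
that issues them in order -- written with plain POSIX threads rather than the
kernel primitives used in the patch. All names in it (batch, submit_batch,
MAX_IN_FLIGHT, writer, and so on) are made up for illustration and do not
appear in the btrfs code.

/*
 * flowctl.c - illustrative sketch only; hypothetical names, not btrfs code.
 * Build: gcc -O2 -pthread flowctl.c -o flowctl
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SZ       4096
#define PAGES_PER_BIO 32	/* collect 32 pages -> one 128k write */
#define MAX_IN_FLIGHT 64	/* flow control: bounds allocated memory */

struct batch {
	size_t len;
	char data[PAGES_PER_BIO * PAGE_SZ];
};

static struct batch *queue[MAX_IN_FLIGHT];
static int q_head, q_tail, q_count, done;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t not_full = PTHREAD_COND_INITIALIZER;
static pthread_cond_t not_empty = PTHREAD_COND_INITIALIZER;

/* read side: hand over a full batch; blocks while too many are in flight */
static void submit_batch(struct batch *b)
{
	pthread_mutex_lock(&lock);
	while (q_count == MAX_IN_FLIGHT)
		pthread_cond_wait(&not_full, &lock);	/* flow control */
	queue[q_tail] = b;
	q_tail = (q_tail + 1) % MAX_IN_FLIGHT;
	q_count++;
	pthread_cond_signal(&not_empty);
	pthread_mutex_unlock(&lock);
}

/* single writer: issues one large, ordered write per batch */
static void *writer(void *arg)
{
	FILE *out = arg;

	for (;;) {
		pthread_mutex_lock(&lock);
		while (q_count == 0 && !done)
			pthread_cond_wait(&not_empty, &lock);
		if (q_count == 0 && done) {
			pthread_mutex_unlock(&lock);
			return NULL;
		}
		struct batch *b = queue[q_head];
		q_head = (q_head + 1) % MAX_IN_FLIGHT;
		q_count--;
		pthread_cond_signal(&not_full);
		pthread_mutex_unlock(&lock);

		fwrite(b->data, 1, b->len, out);	/* one big write */
		free(b);
	}
}

int main(void)
{
	FILE *out = fopen("/tmp/flowctl.out", "w");
	pthread_t tid;
	int i;

	if (!out)
		return 1;
	pthread_create(&tid, NULL, writer, out);
	for (i = 0; i < 256; i++) {		/* "read" side produces batches */
		struct batch *b = malloc(sizeof(*b));
		if (!b)
			break;
		b->len = sizeof(b->data);
		memset(b->data, i & 0xff, b->len);
		submit_batch(b);
	}
	pthread_mutex_lock(&lock);
	done = 1;
	pthread_cond_signal(&not_empty);
	pthread_mutex_unlock(&lock);
	pthread_join(tid, NULL);
	fclose(out);
	return 0;
}

In the patch below, the roles loosely correspond to scrub_add_page_to_wr_bio()
(collecting pages until a bio is full or becomes non-contiguous),
scrub_wr_submit() (handing the bio to the block layer from a single context),
and the scrub_wr_completion_workers (completing writes); the in-flight
accounting via scrub_pending_bio_inc()/dec() provides the flow control.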
Diffstat (limited to 'fs/btrfs')
-rw-r--r--	fs/btrfs/ctree.h	|   2
-rw-r--r--	fs/btrfs/dev-replace.h	|  26
-rw-r--r--	fs/btrfs/reada.c	|  10
-rw-r--r--	fs/btrfs/scrub.c	| 883
-rw-r--r--	fs/btrfs/super.c	|   3
5 files changed, 851 insertions, 73 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746cb6aa1f62..ded7caa0d304 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1483,6 +1483,8 @@ struct btrfs_fs_info {
 	struct rw_semaphore scrub_super_lock;
 	int scrub_workers_refcnt;
 	struct btrfs_workers scrub_workers;
+	struct btrfs_workers scrub_wr_completion_workers;
+	struct btrfs_workers scrub_nocow_workers;
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	u32 check_integrity_print_mask;
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 000000000000..1fb5c89037ee
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) STRATO AG 2012.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_DEV_REPLACE__)
+#define __BTRFS_DEV_REPLACE__
+
+static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
+{
+	atomic64_inc(stat_value);
+}
+#endif
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 0ddc5659f946..9f363e17ec74 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -418,12 +418,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 		 */
 			continue;
 		}
+		if (!dev->bdev) {
+			/* cannot read ahead on missing device */
+			continue;
+		}
 		prev_dev = dev;
 		ret = radix_tree_insert(&dev->reada_extents, index, re);
 		if (ret) {
 			while (--i >= 0) {
 				dev = bbio->stripes[i].dev;
 				BUG_ON(dev == NULL);
+				/* ignore whether the entry was inserted */
 				radix_tree_delete(&dev->reada_extents, index);
 			}
 			BUG_ON(fs_info == NULL);
@@ -914,7 +919,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
 	generation = btrfs_header_generation(node);
 	free_extent_buffer(node);
 
-	reada_add_block(rc, start, &max_key, level, generation);
+	if (reada_add_block(rc, start, &max_key, level, generation)) {
+		kfree(rc);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	reada_start_machine(root->fs_info);
 
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 460e30bb1884..61157a26cf2a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -25,6 +25,7 @@
 #include "transaction.h"
 #include "backref.h"
 #include "extent_io.h"
+#include "dev-replace.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
 
@@ -44,8 +45,15 @@
 struct scrub_block;
 struct scrub_ctx;
 
-#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
-#define SCRUB_BIOS_PER_CTX	16	/* 1 MB per device in flight */
+/*
+ * the following three values only influence the performance.
+ * The last one configures the number of parallel and outstanding I/O
+ * operations. The first two values configure an upper limit for the number
+ * of (dynamically allocated) pages that are added to a bio.
+ */
+#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
+#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
+#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
 
 /*
  * the following value times PAGE_SIZE needs to be large enough to match the
@@ -62,6 +70,7 @@ struct scrub_page {
 	u64 generation;
 	u64 logical;
 	u64 physical;
+	u64 physical_for_dev_replace;
 	atomic_t ref_count;
 	struct {
 		unsigned int mirror_num:8;
@@ -79,7 +88,11 @@ struct scrub_bio {
 	int err;
 	u64 logical;
 	u64 physical;
-	struct scrub_page *pagev[SCRUB_PAGES_PER_BIO];
+#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
+	struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
+#else
+	struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
+#endif
 	int page_count;
 	int next_free;
 	struct btrfs_work work;
@@ -99,8 +112,16 @@ struct scrub_block {
 	};
 };
 
+struct scrub_wr_ctx {
+	struct scrub_bio *wr_curr_bio;
+	struct btrfs_device *tgtdev;
+	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
+	atomic_t flush_all_writes;
+	struct mutex wr_lock;
+};
+
 struct scrub_ctx {
-	struct scrub_bio *bios[SCRUB_BIOS_PER_CTX];
+	struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
 	struct btrfs_root *dev_root;
 	int first_free;
 	int curr;
@@ -112,12 +133,13 @@ struct scrub_ctx {
 	struct list_head csum_list;
 	atomic_t cancel_req;
 	int readonly;
-	int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
+	int pages_per_rd_bio;
 	u32 sectorsize;
 	u32 nodesize;
 	u32 leafsize;
 
 	int is_dev_replace;
+	struct scrub_wr_ctx wr_ctx;
 
 	/*
 	 * statistics
@@ -135,6 +157,15 @@ struct scrub_fixup_nodatasum {
 	int mirror_num;
 };
 
+struct scrub_copy_nocow_ctx {
+	struct scrub_ctx *sctx;
+	u64 logical;
+	u64 len;
+	int mirror_num;
+	u64 physical_for_dev_replace;
+	struct btrfs_work work;
+};
+
 struct scrub_warning {
 	struct btrfs_path *path;
 	u64 extent_item_size;
@@ -156,8 +187,9 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 				     struct btrfs_fs_info *fs_info,
+				     struct scrub_block *original_sblock,
 				     u64 length, u64 logical,
-				     struct scrub_block *sblock);
+				     struct scrub_block *sblocks_for_recheck);
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 				struct scrub_block *sblock, int is_metadata,
 				int have_csum, u8 *csum, u64 generation,
@@ -174,6 +206,9 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 					    struct scrub_block *sblock_good,
 					    int page_num, int force_write);
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+					   int page_num);
 static int scrub_checksum_data(struct scrub_block *sblock);
 static int scrub_checksum_tree_block(struct scrub_block *sblock);
 static int scrub_checksum_super(struct scrub_block *sblock);
@@ -181,14 +216,38 @@ static void scrub_block_get(struct scrub_block *sblock);
 static void scrub_block_put(struct scrub_block *sblock);
 static void scrub_page_get(struct scrub_page *spage);
 static void scrub_page_put(struct scrub_page *spage);
-static int scrub_add_page_to_bio(struct scrub_ctx *sctx,
-				 struct scrub_page *spage);
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage);
 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 		       u64 physical, struct btrfs_device *dev, u64 flags,
-		       u64 gen, int mirror_num, u8 *csum, int force);
+		       u64 gen, int mirror_num, u8 *csum, int force,
+		       u64 physical_for_dev_replace);
 static void scrub_bio_end_io(struct bio *bio, int err);
 static void scrub_bio_end_io_worker(struct btrfs_work *work);
 static void scrub_block_complete(struct scrub_block *sblock);
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+			       u64 extent_logical, u64 extent_len,
+			       u64 *extent_physical,
+			       struct btrfs_device **extent_dev,
+			       int *extent_mirror_num);
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+			      struct scrub_wr_ctx *wr_ctx,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_device *dev,
+			      int is_dev_replace);
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage);
+static void scrub_wr_submit(struct scrub_ctx *sctx);
+static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
+static int write_page_nocow(struct scrub_ctx *sctx,
+			    u64 physical_for_dev_replace, struct page *page);
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+				      void *ctx);
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+			    int mirror_num, u64 physical_for_dev_replace);
+static void copy_nocow_pages_worker(struct btrfs_work *work);
 
 
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -262,19 +321,20 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 	if (!sctx)
 		return;
 
+	scrub_free_wr_ctx(&sctx->wr_ctx);
+
 	/* this can happen when scrub is cancelled */
 	if (sctx->curr != -1) {
 		struct scrub_bio *sbio = sctx->bios[sctx->curr];
 
 		for (i = 0; i < sbio->page_count; i++) {
-			BUG_ON(!sbio->pagev[i]);
-			BUG_ON(!sbio->pagev[i]->page);
+			WARN_ON(!sbio->pagev[i]->page);
 			scrub_block_put(sbio->pagev[i]->sblock);
 		}
 		bio_put(sbio->bio);
 	}
 
-	for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) {
+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 		struct scrub_bio *sbio = sctx->bios[i];
 
 		if (!sbio)
@@ -292,18 +352,29 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 	struct scrub_ctx *sctx;
 	int i;
 	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
-	int pages_per_bio;
+	int pages_per_rd_bio;
+	int ret;
 
-	pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
-			      bio_get_nr_vecs(dev->bdev));
+	/*
+	 * the setting of pages_per_rd_bio is correct for scrub but might
+	 * be wrong for the dev_replace code where we might read from
+	 * different devices in the initial huge bios. However, that
+	 * code is able to correctly handle the case when adding a page
+	 * to a bio fails.
+	 */
+	if (dev->bdev)
+		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
+					 bio_get_nr_vecs(dev->bdev));
+	else
+		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
 	if (!sctx)
 		goto nomem;
 	sctx->is_dev_replace = is_dev_replace;
-	sctx->pages_per_bio = pages_per_bio;
+	sctx->pages_per_rd_bio = pages_per_rd_bio;
 	sctx->curr = -1;
 	sctx->dev_root = dev->dev_root;
-	for (i = 0; i < SCRUB_BIOS_PER_CTX; ++i) {
+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 		struct scrub_bio *sbio;
 
 		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
@@ -316,7 +387,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 		sbio->page_count = 0;
 		sbio->work.func = scrub_bio_end_io_worker;
 
-		if (i != SCRUB_BIOS_PER_CTX - 1)
+		if (i != SCRUB_BIOS_PER_SCTX - 1)
 			sctx->bios[i]->next_free = i + 1;
 		else
 			sctx->bios[i]->next_free = -1;
@@ -334,6 +405,13 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 	spin_lock_init(&sctx->list_lock);
 	spin_lock_init(&sctx->stat_lock);
 	init_waitqueue_head(&sctx->list_wait);
+
+	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
+				 fs_info->dev_replace.tgtdev, is_dev_replace);
+	if (ret) {
+		scrub_free_ctx(sctx);
+		return ERR_PTR(ret);
+	}
 	return sctx;
 
 nomem:
@@ -341,7 +419,8 @@ nomem:
 	return ERR_PTR(-ENOMEM);
 }
 
-static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
+				     void *warn_ctx)
 {
 	u64 isize;
 	u32 nlink;
@@ -349,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
 	int i;
 	struct extent_buffer *eb;
 	struct btrfs_inode_item *inode_item;
-	struct scrub_warning *swarn = ctx;
+	struct scrub_warning *swarn = warn_ctx;
 	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
 	struct inode_fs_paths *ipath = NULL;
 	struct btrfs_root *local_root;
@@ -492,11 +571,11 @@ out:
 	kfree(swarn.msg_buf);
 }
 
-static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 {
 	struct page *page = NULL;
 	unsigned long index;
-	struct scrub_fixup_nodatasum *fixup = ctx;
+	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
 	int ret;
 	int corrected = 0;
 	struct btrfs_key key;
@@ -660,7 +739,9 @@ out:
 		spin_lock(&sctx->stat_lock);
 		++sctx->stat.uncorrectable_errors;
 		spin_unlock(&sctx->stat_lock);
-
+		btrfs_dev_replace_stats_inc(
+			&sctx->dev_root->fs_info->dev_replace.
+			num_uncorrectable_read_errors);
 		printk_ratelimited_in_rcu(KERN_ERR
 			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
 			(unsigned long long)fixup->logical,
@@ -715,6 +796,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	csum = sblock_to_check->pagev[0]->csum;
 	dev = sblock_to_check->pagev[0]->dev;
 
+	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
+		sblocks_for_recheck = NULL;
+		goto nodatasum_case;
+	}
+
 	/*
 	 * read all mirrors one after the other. This includes to
 	 * re-read the extent or metadata block that failed (that was
@@ -758,7 +844,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	}
 
 	/* setup the context, map the logical blocks and alloc the pages */
-	ret = scrub_setup_recheck_block(sctx, fs_info, length,
+	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
 					logical, sblocks_for_recheck);
 	if (ret) {
 		spin_lock(&sctx->stat_lock);
@@ -789,6 +875,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		sctx->stat.unverified_errors++;
 		spin_unlock(&sctx->stat_lock);
 
+		if (sctx->is_dev_replace)
+			scrub_write_block_to_dev_replace(sblock_bad);
 		goto out;
 	}
 
@@ -822,12 +910,15 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 					BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	}
 
-	if (sctx->readonly)
+	if (sctx->readonly && !sctx->is_dev_replace)
 		goto did_not_correct_error;
 
 	if (!is_metadata && !have_csum) {
 		struct scrub_fixup_nodatasum *fixup_nodatasum;
 
+nodatasum_case:
+		WARN_ON(sctx->is_dev_replace);
+
 		/*
 		 * !is_metadata and !have_csum, this means that the data
 		 * might not be COW'ed, that it might be modified
@@ -883,18 +974,79 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		if (!sblock_other->header_error &&
 		    !sblock_other->checksum_error &&
 		    sblock_other->no_io_error_seen) {
-			int force_write = is_metadata || have_csum;
-
-			ret = scrub_repair_block_from_good_copy(sblock_bad,
-								sblock_other,
-								force_write);
+			if (sctx->is_dev_replace) {
+				scrub_write_block_to_dev_replace(sblock_other);
+			} else {
+				int force_write = is_metadata || have_csum;
+
+				ret = scrub_repair_block_from_good_copy(
+						sblock_bad, sblock_other,
+						force_write);
+			}
 			if (0 == ret)
 				goto corrected_error;
 		}
 	}
 
 	/*
-	 * in case of I/O errors in the area that is supposed to be
+	 * for dev_replace, pick good pages and write to the target device.
+	 */
+	if (sctx->is_dev_replace) {
+		success = 1;
+		for (page_num = 0; page_num < sblock_bad->page_count;
+		     page_num++) {
+			int sub_success;
+
+			sub_success = 0;
+			for (mirror_index = 0;
+			     mirror_index < BTRFS_MAX_MIRRORS &&
+			     sblocks_for_recheck[mirror_index].page_count > 0;
+			     mirror_index++) {
+				struct scrub_block *sblock_other =
+					sblocks_for_recheck + mirror_index;
+				struct scrub_page *page_other =
+					sblock_other->pagev[page_num];
+
+				if (!page_other->io_error) {
+					ret = scrub_write_page_to_dev_replace(
+							sblock_other, page_num);
+					if (ret == 0) {
+						/* succeeded for this page */
+						sub_success = 1;
+						break;
+					} else {
+						btrfs_dev_replace_stats_inc(
+							&sctx->dev_root->
+							fs_info->dev_replace.
+							num_write_errors);
+					}
+				}
+			}
+
+			if (!sub_success) {
+				/*
+				 * did not find a mirror to fetch the page
+				 * from. scrub_write_page_to_dev_replace()
+				 * handles this case (page->io_error), by
+				 * filling the block with zeros before
+				 * submitting the write request
+				 */
+				success = 0;
+				ret = scrub_write_page_to_dev_replace(
+						sblock_bad, page_num);
+				if (ret)
+					btrfs_dev_replace_stats_inc(
+						&sctx->dev_root->fs_info->
+						dev_replace.num_write_errors);
+			}
+		}
+
+		goto out;
+	}
+
+	/*
+	 * for regular scrub, repair those pages that are errored.
+	 * In case of I/O errors in the area that is supposed to be
 	 * repaired, continue by picking good copies of those pages.
 	 * Select the good pages from mirrors to rewrite bad pages from
 	 * the area to fix. Afterwards verify the checksum of the block
@@ -1017,6 +1169,7 @@ out:
 
 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 				     struct btrfs_fs_info *fs_info,
+				     struct scrub_block *original_sblock,
 				     u64 length, u64 logical,
 				     struct scrub_block *sblocks_for_recheck)
 {
@@ -1047,7 +1200,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 		return -EIO;
 	}
 
-	BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
+	BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
 	for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
 	     mirror_index++) {
 		struct scrub_block *sblock;
@@ -1071,6 +1224,10 @@ leave_nomem:
 			sblock->pagev[page_index] = page;
 			page->logical = logical;
 			page->physical = bbio->stripes[mirror_index].physical;
+			BUG_ON(page_index >= original_sblock->page_count);
+			page->physical_for_dev_replace =
+				original_sblock->pagev[page_index]->
+				physical_for_dev_replace;
 			/* for missing devices, dev->bdev is NULL */
 			page->dev = bbio->stripes[mirror_index].dev;
 			page->mirror_num = mirror_index + 1;
@@ -1249,6 +1406,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 	int ret;
 	DECLARE_COMPLETION_ONSTACK(complete);
 
+	if (!page_bad->dev->bdev) {
+		printk_ratelimited(KERN_WARNING
+			"btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
+		return -EIO;
+	}
+
 	bio = bio_alloc(GFP_NOFS, 1);
 	if (!bio)
 		return -EIO;
@@ -1269,6 +1432,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 		if (!bio_flagged(bio, BIO_UPTODATE)) {
 			btrfs_dev_stat_inc_and_print(page_bad->dev,
 				BTRFS_DEV_STAT_WRITE_ERRS);
+			btrfs_dev_replace_stats_inc(
+				&sblock_bad->sctx->dev_root->fs_info->
+				dev_replace.num_write_errors);
 			bio_put(bio);
 			return -EIO;
 		}
@@ -1278,7 +1444,168 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 	return 0;
 }
 
-static void scrub_checksum(struct scrub_block *sblock)
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
+{
+	int page_num;
+
+	for (page_num = 0; page_num < sblock->page_count; page_num++) {
+		int ret;
+
+		ret = scrub_write_page_to_dev_replace(sblock, page_num);
+		if (ret)
+			btrfs_dev_replace_stats_inc(
+				&sblock->sctx->dev_root->fs_info->dev_replace.
+				num_write_errors);
+	}
+}
+
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+					   int page_num)
+{
+	struct scrub_page *spage = sblock->pagev[page_num];
+
+	BUG_ON(spage->page == NULL);
+	if (spage->io_error) {
+		void *mapped_buffer = kmap_atomic(spage->page);
+
+		memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
+		flush_dcache_page(spage->page);
+		kunmap_atomic(mapped_buffer);
+	}
+	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
+}
+
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage)
+{
+	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+	struct scrub_bio *sbio;
+	int ret;
+
+	mutex_lock(&wr_ctx->wr_lock);
+again:
+	if (!wr_ctx->wr_curr_bio) {
+		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
+					      GFP_NOFS);
+		if (!wr_ctx->wr_curr_bio) {
+			mutex_unlock(&wr_ctx->wr_lock);
+			return -ENOMEM;
+		}
+		wr_ctx->wr_curr_bio->sctx = sctx;
+		wr_ctx->wr_curr_bio->page_count = 0;
+	}
+	sbio = wr_ctx->wr_curr_bio;
+	if (sbio->page_count == 0) {
+		struct bio *bio;
+
+		sbio->physical = spage->physical_for_dev_replace;
+		sbio->logical = spage->logical;
+		sbio->dev = wr_ctx->tgtdev;
+		bio = sbio->bio;
+		if (!bio) {
+			bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
+			if (!bio) {
+				mutex_unlock(&wr_ctx->wr_lock);
+				return -ENOMEM;
+			}
+			sbio->bio = bio;
+		}
+
+		bio->bi_private = sbio;
+		bio->bi_end_io = scrub_wr_bio_end_io;
+		bio->bi_bdev = sbio->dev->bdev;
+		bio->bi_sector = sbio->physical >> 9;
+		sbio->err = 0;
+	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+		   spage->physical_for_dev_replace ||
+		   sbio->logical + sbio->page_count * PAGE_SIZE !=
+		   spage->logical) {
+		scrub_wr_submit(sctx);
+		goto again;
+	}
+
+	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+	if (ret != PAGE_SIZE) {
+		if (sbio->page_count < 1) {
+			bio_put(sbio->bio);
+			sbio->bio = NULL;
+			mutex_unlock(&wr_ctx->wr_lock);
+			return -EIO;
+		}
+		scrub_wr_submit(sctx);
+		goto again;
+	}
+
+	sbio->pagev[sbio->page_count] = spage;
+	scrub_page_get(spage);
+	sbio->page_count++;
+	if (sbio->page_count == wr_ctx->pages_per_wr_bio)
+		scrub_wr_submit(sctx);
+	mutex_unlock(&wr_ctx->wr_lock);
+
+	return 0;
+}
+
+static void scrub_wr_submit(struct scrub_ctx *sctx)
+{
+	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+	struct scrub_bio *sbio;
+
+	if (!wr_ctx->wr_curr_bio)
+		return;
+
+	sbio = wr_ctx->wr_curr_bio;
+	wr_ctx->wr_curr_bio = NULL;
+	WARN_ON(!sbio->bio->bi_bdev);
+	scrub_pending_bio_inc(sctx);
+	/* process all writes in a single worker thread. Then the block layer
+	 * orders the requests before sending them to the driver which
+	 * doubled the write performance on spinning disks when measured
+	 * with Linux 3.5 */
+	btrfsic_submit_bio(WRITE, sbio->bio);
+}
+
+static void scrub_wr_bio_end_io(struct bio *bio, int err)
+{
+	struct scrub_bio *sbio = bio->bi_private;
+	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
+
+	sbio->err = err;
+	sbio->bio = bio;
+
+	sbio->work.func = scrub_wr_bio_end_io_worker;
+	btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
+}
+
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
+{
+	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+	struct scrub_ctx *sctx = sbio->sctx;
+	int i;
+
+	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
+	if (sbio->err) {
+		struct btrfs_dev_replace *dev_replace =
+			&sbio->sctx->dev_root->fs_info->dev_replace;
+
+		for (i = 0; i < sbio->page_count; i++) {
+			struct scrub_page *spage = sbio->pagev[i];
+
+			spage->io_error = 1;
+			btrfs_dev_replace_stats_inc(&dev_replace->
+						    num_write_errors);
+		}
+	}
+
+	for (i = 0; i < sbio->page_count; i++)
+		scrub_page_put(sbio->pagev[i]);
+
+	bio_put(sbio->bio);
+	kfree(sbio);
+	scrub_pending_bio_dec(sctx);
+}
+
+static int scrub_checksum(struct scrub_block *sblock)
 {
 	u64 flags;
 	int ret;
@@ -1296,6 +1623,8 @@ static void scrub_checksum(struct scrub_block *sblock)
 		WARN_ON(1);
 	if (ret)
 		scrub_handle_errored_block(sblock);
+
+	return ret;
 }
 
 static int scrub_checksum_data(struct scrub_block *sblock)
@@ -1386,7 +1715,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 			   BTRFS_UUID_SIZE))
 		++fail;
 
-	BUG_ON(sctx->nodesize != sctx->leafsize);
+	WARN_ON(sctx->nodesize != sctx->leafsize);
 	len = sctx->nodesize - BTRFS_CSUM_SIZE;
 	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
 	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
@@ -1534,11 +1863,24 @@ static void scrub_submit(struct scrub_ctx *sctx)
 	sctx->curr = -1;
 	scrub_pending_bio_inc(sctx);
 
-	btrfsic_submit_bio(READ, sbio->bio);
+	if (!sbio->bio->bi_bdev) {
+		/*
+		 * this case should not happen. If btrfs_map_block() is
+		 * wrong, it could happen for dev-replace operations on
+		 * missing devices when no mirrors are available, but in
+		 * this case it should already fail the mount.
+		 * This case is handled correctly (but _very_ slowly).
+		 */
+		printk_ratelimited(KERN_WARNING
+			"btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
+		bio_endio(sbio->bio, -EIO);
+	} else {
+		btrfsic_submit_bio(READ, sbio->bio);
+	}
 }
 
-static int scrub_add_page_to_bio(struct scrub_ctx *sctx,
-				 struct scrub_page *spage)
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage)
 {
 	struct scrub_block *sblock = spage->sblock;
 	struct scrub_bio *sbio;
@@ -1570,7 +1912,7 @@ again:
 		sbio->dev = spage->dev;
 		bio = sbio->bio;
 		if (!bio) {
-			bio = bio_alloc(GFP_NOFS, sctx->pages_per_bio);
+			bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
 			if (!bio)
 				return -ENOMEM;
 			sbio->bio = bio;
@@ -1602,10 +1944,10 @@ again:
 		goto again;
 	}
 
-	scrub_block_get(sblock); /* one for the added page */
+	scrub_block_get(sblock); /* one for the page added to the bio */
 	atomic_inc(&sblock->outstanding_pages);
 	sbio->page_count++;
-	if (sbio->page_count == sctx->pages_per_bio)
+	if (sbio->page_count == sctx->pages_per_rd_bio)
 		scrub_submit(sctx);
 
 	return 0;
@@ -1613,7 +1955,8 @@ again:
 
 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 		       u64 physical, struct btrfs_device *dev, u64 flags,
-		       u64 gen, int mirror_num, u8 *csum, int force)
+		       u64 gen, int mirror_num, u8 *csum, int force,
+		       u64 physical_for_dev_replace)
 {
 	struct scrub_block *sblock;
 	int index;
@@ -1654,6 +1997,7 @@ leave_nomem:
 		spage->generation = gen;
 		spage->logical = logical;
 		spage->physical = physical;
+		spage->physical_for_dev_replace = physical_for_dev_replace;
 		spage->mirror_num = mirror_num;
 		if (csum) {
 			spage->have_csum = 1;
@@ -1668,6 +2012,7 @@ leave_nomem:
 		len -= l;
 		logical += l;
 		physical += l;
+		physical_for_dev_replace += l;
 	}
 
 	WARN_ON(sblock->page_count == 0);
@@ -1675,7 +2020,7 @@ leave_nomem:
 		struct scrub_page *spage = sblock->pagev[index];
 		int ret;
 
-		ret = scrub_add_page_to_bio(sctx, spage);
+		ret = scrub_add_page_to_rd_bio(sctx, spage);
 		if (ret) {
 			scrub_block_put(sblock);
 			return ret;
@@ -1707,7 +2052,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
 	struct scrub_ctx *sctx = sbio->sctx;
 	int i;
 
-	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
+	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
 	if (sbio->err) {
 		for (i = 0; i < sbio->page_count; i++) {
 			struct scrub_page *spage = sbio->pagev[i];
@@ -1733,15 +2078,30 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
 	sbio->next_free = sctx->first_free;
 	sctx->first_free = sbio->index;
 	spin_unlock(&sctx->list_lock);
+
+	if (sctx->is_dev_replace &&
+	    atomic_read(&sctx->wr_ctx.flush_all_writes)) {
+		mutex_lock(&sctx->wr_ctx.wr_lock);
+		scrub_wr_submit(sctx);
+		mutex_unlock(&sctx->wr_ctx.wr_lock);
+	}
+
 	scrub_pending_bio_dec(sctx);
 }
 
 static void scrub_block_complete(struct scrub_block *sblock)
 {
-	if (!sblock->no_io_error_seen)
+	if (!sblock->no_io_error_seen) {
 		scrub_handle_errored_block(sblock);
-	else
-		scrub_checksum(sblock);
+	} else {
+		/*
+		 * if has checksum error, write via repair mechanism in
+		 * dev replace case, otherwise write here in dev replace
+		 * case.
+		 */
+		if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
+			scrub_write_block_to_dev_replace(sblock);
+	}
 }
 
 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -1786,7 +2146,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
 /* scrub extent tries to collect up to 64 kB for each bio */
 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
 			u64 physical, struct btrfs_device *dev, u64 flags,
-			u64 gen, int mirror_num)
+			u64 gen, int mirror_num, u64 physical_for_dev_replace)
 {
 	int ret;
 	u8 csum[BTRFS_CSUM_SIZE];
@@ -1799,7 +2159,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
 		sctx->stat.data_bytes_scrubbed += len;
 		spin_unlock(&sctx->stat_lock);
 	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-		BUG_ON(sctx->nodesize != sctx->leafsize);
+		WARN_ON(sctx->nodesize != sctx->leafsize);
 		blocksize = sctx->nodesize;
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.tree_extents_scrubbed++;
@@ -1807,7 +2167,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
 		spin_unlock(&sctx->stat_lock);
 	} else {
 		blocksize = sctx->sectorsize;
-		BUG_ON(1);
+		WARN_ON(1);
 	}
 
 	while (len) {
@@ -1819,14 +2179,23 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
 			have_csum = scrub_find_csum(sctx, logical, l, csum);
 			if (have_csum == 0)
 				++sctx->stat.no_csum;
+			if (sctx->is_dev_replace && !have_csum) {
+				ret = copy_nocow_pages(sctx, logical, l,
+						       mirror_num,
+						       physical_for_dev_replace);
+				goto behind_scrub_pages;
+			}
 		}
 		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
-				  mirror_num, have_csum ? csum : NULL, 0);
+				  mirror_num, have_csum ? csum : NULL, 0,
+				  physical_for_dev_replace);
+behind_scrub_pages:
 		if (ret)
 			return ret;
 		len -= l;
 		logical += l;
 		physical += l;
+		physical_for_dev_replace += l;
 	}
 	return 0;
 }
@@ -1834,7 +2203,8 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 					   struct map_lookup *map,
 					   struct btrfs_device *scrub_dev,
-					   int num, u64 base, u64 length)
+					   int num, u64 base, u64 length,
+					   int is_dev_replace)
 {
 	struct btrfs_path *path;
 	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
@@ -1859,6 +2229,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	struct btrfs_key key_end;
 	u64 increment = map->stripe_len;
 	u64 offset;
+	u64 extent_logical;
+	u64 extent_physical;
+	u64 extent_len;
+	struct btrfs_device *extent_dev;
+	int extent_mirror_num;
 
 	nstripes = length;
 	offset = 0;
@@ -1966,9 +2341,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 		 */
 		if (atomic_read(&fs_info->scrub_pause_req)) {
 			/* push queued extents */
+			atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
 			scrub_submit(sctx);
+			mutex_lock(&sctx->wr_ctx.wr_lock);
+			scrub_wr_submit(sctx);
+			mutex_unlock(&sctx->wr_ctx.wr_lock);
 			wait_event(sctx->list_wait,
 				   atomic_read(&sctx->bios_in_flight) == 0);
+			atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
 			atomic_inc(&fs_info->scrubs_paused);
 			wake_up(&fs_info->scrub_pause_wait);
 			mutex_lock(&fs_info->scrub_lock);
@@ -2063,10 +2443,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 					key.objectid;
 			}
 
-			ret = scrub_extent(sctx, key.objectid, key.offset,
-					   key.objectid - logical + physical,
-					   scrub_dev, flags, generation,
-					   mirror_num);
+			extent_logical = key.objectid;
+			extent_physical = key.objectid - logical + physical;
+			extent_len = key.offset;
+			extent_dev = scrub_dev;
+			extent_mirror_num = mirror_num;
+			if (is_dev_replace)
+				scrub_remap_extent(fs_info, extent_logical,
+						   extent_len, &extent_physical,
+						   &extent_dev,
+						   &extent_mirror_num);
+			ret = scrub_extent(sctx, extent_logical, extent_len,
+					   extent_physical, extent_dev, flags,
					   generation, extent_mirror_num,
+					   key.objectid - logical + physical);
 			if (ret)
 				goto out;
 
@@ -2080,10 +2470,13 @@ next:
 			sctx->stat.last_physical = physical;
 			spin_unlock(&sctx->stat_lock);
 		}
+out:
 	/* push queued extents */
 	scrub_submit(sctx);
+	mutex_lock(&sctx->wr_ctx.wr_lock);
+	scrub_wr_submit(sctx);
+	mutex_unlock(&sctx->wr_ctx.wr_lock);
 
-out:
 	blk_finish_plug(&plug);
 	btrfs_free_path(path);
 	return ret < 0 ? ret : 0;
@@ -2093,14 +2486,14 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 					  struct btrfs_device *scrub_dev,
 					  u64 chunk_tree, u64 chunk_objectid,
 					  u64 chunk_offset, u64 length,
-					  u64 dev_offset)
+					  u64 dev_offset, int is_dev_replace)
 {
 	struct btrfs_mapping_tree *map_tree =
 		&sctx->dev_root->fs_info->mapping_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
 	int i;
-	int ret = -EINVAL;
+	int ret = 0;
 
 	read_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
@@ -2120,7 +2513,8 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
 		    map->stripes[i].physical == dev_offset) {
 			ret = scrub_stripe(sctx, map, scrub_dev, i,
-					   chunk_offset, length);
+					   chunk_offset, length,
+					   is_dev_replace);
 			if (ret)
 				goto out;
 		}
@@ -2133,7 +2527,8 @@ out:
 
 static noinline_for_stack
 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
-			   struct btrfs_device *scrub_dev, u64 start, u64 end)
+			   struct btrfs_device *scrub_dev, u64 start, u64 end,
+			   int is_dev_replace)
 {
 	struct btrfs_dev_extent *dev_extent = NULL;
 	struct btrfs_path *path;
@@ -2149,6 +2544,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct btrfs_block_group_cache *cache;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -2214,11 +2610,61 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 			ret = -ENOENT;
 			break;
 		}
+		dev_replace->cursor_right = found_key.offset + length;
+		dev_replace->cursor_left = found_key.offset;
+		dev_replace->item_needs_writeback = 1;
 		ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
-				  chunk_offset, length, found_key.offset);
+				  chunk_offset, length, found_key.offset,
+				  is_dev_replace);
+
+		/*
+		 * flush, submit all pending read and write bios, afterwards
+		 * wait for them.
+		 * Note that in the dev replace case, a read request causes
+		 * write requests that are submitted in the read completion
+		 * worker. Therefore in the current situation, it is required
+		 * that all write requests are flushed, so that all read and
+		 * write requests are really completed when bios_in_flight
+		 * changes to 0.
+		 */
+		atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+		scrub_submit(sctx);
+		mutex_lock(&sctx->wr_ctx.wr_lock);
+		scrub_wr_submit(sctx);
+		mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+		wait_event(sctx->list_wait,
+			   atomic_read(&sctx->bios_in_flight) == 0);
+		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+		atomic_inc(&fs_info->scrubs_paused);
+		wake_up(&fs_info->scrub_pause_wait);
+		wait_event(sctx->list_wait,
+			   atomic_read(&sctx->workers_pending) == 0);
+
+		mutex_lock(&fs_info->scrub_lock);
+		while (atomic_read(&fs_info->scrub_pause_req)) {
+			mutex_unlock(&fs_info->scrub_lock);
+			wait_event(fs_info->scrub_pause_wait,
+				   atomic_read(&fs_info->scrub_pause_req) == 0);
+			mutex_lock(&fs_info->scrub_lock);
+		}
+		atomic_dec(&fs_info->scrubs_paused);
+		mutex_unlock(&fs_info->scrub_lock);
+		wake_up(&fs_info->scrub_pause_wait);
+
+		dev_replace->cursor_left = dev_replace->cursor_right;
+		dev_replace->item_needs_writeback = 1;
 		btrfs_put_block_group(cache);
 		if (ret)
 			break;
+		if (atomic64_read(&dev_replace->num_write_errors) > 0) {
+			ret = -EIO;
+			break;
+		}
+		if (sctx->stat.malloc_errors > 0) {
+			ret = -ENOMEM;
+			break;
+		}
 
 		key.offset = found_key.offset + length;
 		btrfs_release_path(path);
@@ -2254,7 +2700,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
 
 		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
 				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
-				  NULL, 1);
+				  NULL, 1, bytenr);
 		if (ret)
 			return ret;
 	}
@@ -2266,18 +2712,38 @@
 /*
  * get a reference count on fs_info->scrub_workers. start worker if necessary
  */
-static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
+static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
+						int is_dev_replace)
 {
 	int ret = 0;
 
 	mutex_lock(&fs_info->scrub_lock);
 	if (fs_info->scrub_workers_refcnt == 0) {
-		btrfs_init_workers(&fs_info->scrub_workers, "scrub",
-			   fs_info->thread_pool_size, &fs_info->generic_worker);
+		if (is_dev_replace)
+			btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
+					   &fs_info->generic_worker);
+		else
+			btrfs_init_workers(&fs_info->scrub_workers, "scrub",
+					   fs_info->thread_pool_size,
+					   &fs_info->generic_worker);
 		fs_info->scrub_workers.idle_thresh = 4;
 		ret = btrfs_start_workers(&fs_info->scrub_workers);
 		if (ret)
 			goto out;
+		btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
+				   "scrubwrc",
+				   fs_info->thread_pool_size,
+				   &fs_info->generic_worker);
+		fs_info->scrub_wr_completion_workers.idle_thresh = 2;
+		ret = btrfs_start_workers(
+				&fs_info->scrub_wr_completion_workers);
+		if (ret)
+			goto out;
+		btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
+				   &fs_info->generic_worker);
+		ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
+		if (ret)
+			goto out;
 	}
 	++fs_info->scrub_workers_refcnt;
 out:
@@ -2289,8 +2755,11 @@ out:
 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
 {
 	mutex_lock(&fs_info->scrub_lock);
-	if (--fs_info->scrub_workers_refcnt == 0)
+	if (--fs_info->scrub_workers_refcnt == 0) {
 		btrfs_stop_workers(&fs_info->scrub_workers);
+		btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
+		btrfs_stop_workers(&fs_info->scrub_nocow_workers);
+	}
 	WARN_ON(fs_info->scrub_workers_refcnt < 0);
 	mutex_unlock(&fs_info->scrub_lock);
 }
@@ -2354,7 +2823,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		return -EINVAL;
 	}
 
-	ret = scrub_workers_get(fs_info);
+	ret = scrub_workers_get(fs_info, is_dev_replace);
 	if (ret)
 		return ret;
 
@@ -2394,12 +2863,15 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	mutex_unlock(&fs_info->scrub_lock);
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
-	down_read(&fs_info->scrub_super_lock);
-	ret = scrub_supers(sctx, dev);
-	up_read(&fs_info->scrub_super_lock);
+	if (!is_dev_replace) {
+		down_read(&fs_info->scrub_super_lock);
+		ret = scrub_supers(sctx, dev);
+		up_read(&fs_info->scrub_super_lock);
+	}
 
 	if (!ret)
-		ret = scrub_enumerate_chunks(sctx, dev, start, end);
+		ret = scrub_enumerate_chunks(sctx, dev, start, end,
+					     is_dev_replace);
 
 	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
 	atomic_dec(&fs_info->scrubs_running);
@@ -2537,3 +3009,272 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 
 	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
 }
+
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+			       u64 extent_logical, u64 extent_len,
+			       u64 *extent_physical,
+			       struct btrfs_device **extent_dev,
+			       int *extent_mirror_num)
+{
+	u64 mapped_length;
+	struct btrfs_bio *bbio = NULL;
+	int ret;
+
+	mapped_length = extent_len;
+	ret = btrfs_map_block(fs_info, READ, extent_logical,
+			      &mapped_length, &bbio, 0);
+	if (ret || !bbio || mapped_length < extent_len ||
+	    !bbio->stripes[0].dev->bdev) {
+		kfree(bbio);
+		return;
+	}
+
+	*extent_physical = bbio->stripes[0].physical;
+	*extent_mirror_num = bbio->mirror_num;
+	*extent_dev = bbio->stripes[0].dev;
+	kfree(bbio);
+}
+
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+			      struct scrub_wr_ctx *wr_ctx,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_device *dev,
+			      int is_dev_replace)
+{
+	WARN_ON(wr_ctx->wr_curr_bio != NULL);
+
+	mutex_init(&wr_ctx->wr_lock);
+	wr_ctx->wr_curr_bio = NULL;
+	if (!is_dev_replace)
+		return 0;
+
+	WARN_ON(!dev->bdev);
+	wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
+					 bio_get_nr_vecs(dev->bdev));
+	wr_ctx->tgtdev = dev;
+	atomic_set(&wr_ctx->flush_all_writes, 0);
+	return 0;
+}
+
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
+{
+	mutex_lock(&wr_ctx->wr_lock);
+	kfree(wr_ctx->wr_curr_bio);
+	wr_ctx->wr_curr_bio = NULL;
+	mutex_unlock(&wr_ctx->wr_lock);
+}
+
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+			    int mirror_num, u64 physical_for_dev_replace)
+{
+	struct scrub_copy_nocow_ctx *nocow_ctx;
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+	nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
+	if (!nocow_ctx) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		return -ENOMEM;
+	}
+
+	scrub_pending_trans_workers_inc(sctx);
+
+	nocow_ctx->sctx = sctx;
+	nocow_ctx->logical = logical;
+	nocow_ctx->len = len;
+	nocow_ctx->mirror_num = mirror_num;
+	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
+	nocow_ctx->work.func = copy_nocow_pages_worker;
+	btrfs_queue_worker(&fs_info->scrub_nocow_workers,
+			   &nocow_ctx->work);
+
+	return 0;
+}
+
+static void copy_nocow_pages_worker(struct btrfs_work *work)
+{
+	struct scrub_copy_nocow_ctx *nocow_ctx =
+		container_of(work, struct scrub_copy_nocow_ctx, work);
+	struct scrub_ctx *sctx = nocow_ctx->sctx;
+	u64 logical = nocow_ctx->logical;
+	u64 len = nocow_ctx->len;
+	int mirror_num = nocow_ctx->mirror_num;
+	u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
+	int ret;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_path *path;
+	struct btrfs_root *root;
+	int not_written = 0;
+
+	fs_info = sctx->dev_root->fs_info;
+	root = fs_info->extent_root;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		not_written = 1;
+		goto out;
+	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		not_written = 1;
+		goto out;
+	}
+
+	ret = iterate_inodes_from_logical(logical, fs_info, path,
+					  copy_nocow_pages_for_inode,
+					  nocow_ctx);
+	if (ret != 0 && ret != -ENOENT) {
+		pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
+			(unsigned long long)logical,
+			(unsigned long long)physical_for_dev_replace,
+			(unsigned long long)len,
+			(unsigned long long)mirror_num, ret);
+		not_written = 1;
+		goto out;
+	}
+
+out:
+	if (trans && !IS_ERR(trans))
+		btrfs_end_transaction(trans, root);
+	if (not_written)
+		btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
+					    num_uncorrectable_read_errors);
+
+	btrfs_free_path(path);
+	kfree(nocow_ctx);
+
+	scrub_pending_trans_workers_dec(sctx);
+}
+
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
+{
+	unsigned long index;
+	struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
+	int ret = 0;
+	struct btrfs_key key;
+	struct inode *inode = NULL;
+	struct btrfs_root *local_root;
+	u64 physical_for_dev_replace;
+	u64 len;
+	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+
+	key.objectid = root;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(local_root))
+		return PTR_ERR(local_root);
+
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.objectid = inum;
+	key.offset = 0;
+	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
+	len = nocow_ctx->len;
+	while (len >= PAGE_CACHE_SIZE) {
+		struct page *page = NULL;
+		int ret_sub;
+
+		index = offset >> PAGE_CACHE_SHIFT;
+
+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+		if (!page) {
+			pr_err("find_or_create_page() failed\n");
+			ret = -ENOMEM;
+			goto next_page;
+		}
+
+		if (PageUptodate(page)) {
+			if (PageDirty(page))
+				goto next_page;
+		} else {
+			ClearPageError(page);
+			ret_sub = extent_read_full_page(&BTRFS_I(inode)->
+							io_tree,
+							page, btrfs_get_extent,
+							nocow_ctx->mirror_num);
+			if (ret_sub) {
+				ret = ret_sub;
+				goto next_page;
+			}
+			wait_on_page_locked(page);
+			if (!PageUptodate(page)) {
+				ret = -EIO;
+				goto next_page;
+			}
+		}
+		ret_sub = write_page_nocow(nocow_ctx->sctx,
+					   physical_for_dev_replace, page);
+		if (ret_sub) {
+			ret = ret_sub;
+			goto next_page;
+		}
+
+next_page:
+		if (page) {
+			unlock_page(page);
+			put_page(page);
+		}
+		offset += PAGE_CACHE_SIZE;
+		physical_for_dev_replace += PAGE_CACHE_SIZE;
+		len -= PAGE_CACHE_SIZE;
+	}
+
+	if (inode)
+		iput(inode);
+	return ret;
+}
+
+static int write_page_nocow(struct scrub_ctx *sctx,
+			    u64 physical_for_dev_replace, struct page *page)
+{
+	struct bio *bio;
+	struct btrfs_device *dev;
+	int ret;
+	DECLARE_COMPLETION_ONSTACK(compl);
+
+	dev = sctx->wr_ctx.tgtdev;
+	if (!dev)
+		return -EIO;
+	if (!dev->bdev) {
+		printk_ratelimited(KERN_WARNING
+			"btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+		return -EIO;
+	}
+	bio = bio_alloc(GFP_NOFS, 1);
+	if (!bio) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		return -ENOMEM;
+	}
+	bio->bi_private = &compl;
+	bio->bi_end_io = scrub_complete_bio_end_io;
+	bio->bi_size = 0;
+	bio->bi_sector = physical_for_dev_replace >> 9;
+	bio->bi_bdev = dev->bdev;
+	ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+	if (ret != PAGE_CACHE_SIZE) {
+leave_with_eio:
+		bio_put(bio);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+		return -EIO;
+	}
+	btrfsic_submit_bio(WRITE_SYNC, bio);
+	wait_for_completion(&compl);
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		goto leave_with_eio;
+
+	bio_put(bio);
+	return 0;
+}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 837ad2d27853..ad4380684b9b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1195,7 +1195,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
 	btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
 	btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
 	btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
-	btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
+			      new_pool_size);
 }
 
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)