Diffstat (limited to 'fs/btrfs/scrub.c')
-rw-r--r--	fs/btrfs/scrub.c	1836
1 file changed, 1333 insertions(+), 503 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 27892f67e69b..bdbb94f245c9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011 STRATO. All rights reserved.
+ * Copyright (C) 2011, 2012 STRATO. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -25,6 +25,7 @@
 #include "transaction.h"
 #include "backref.h"
 #include "extent_io.h"
+#include "dev-replace.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
 
@@ -42,10 +43,23 @@
  */
 
 struct scrub_block;
-struct scrub_dev;
+struct scrub_ctx;
 
-#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
-#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */
+/*
+ * the following three values only influence the performance.
+ * The last one configures the number of parallel and outstanding I/O
+ * operations. The first two values configure an upper limit for the number
+ * of (dynamically allocated) pages that are added to a bio.
+ */
+#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
+#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
+#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
+
+/*
+ * the following value times PAGE_SIZE needs to be large enough to match the
+ * largest node/leaf/sector size that shall be supported.
+ * Values larger than BTRFS_STRIPE_LEN are not supported.
+ */
 #define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
 
 struct scrub_page {
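The comment added above pins down the sizing rule for SCRUB_MAX_PAGES_PER_BLOCK: the value times PAGE_SIZE must cover the largest node/leaf/sector size to be supported, yet must not exceed BTRFS_STRIPE_LEN. A minimal compile-time restatement of that rule, offered as an illustrative sketch that is not part of the patch (the 4K page size and the 64K stripe and node sizes are assumptions here, not values taken from the diff):

#define SCRUB_MAX_PAGES_PER_BLOCK 16		/* as defined above */
#define ASSUMED_PAGE_SIZE	4096		/* assumption: 4K pages */
#define ASSUMED_STRIPE_LEN	(64 * 1024)	/* assumption: BTRFS_STRIPE_LEN */
#define ASSUMED_MAX_NODESIZE	(64 * 1024)	/* assumption: largest node/leaf */

#if (SCRUB_MAX_PAGES_PER_BLOCK * ASSUMED_PAGE_SIZE) < ASSUMED_MAX_NODESIZE
#error "scrub block cannot hold the largest node/leaf/sector"
#endif
#if (SCRUB_MAX_PAGES_PER_BLOCK * ASSUMED_PAGE_SIZE) > ASSUMED_STRIPE_LEN
#error "values larger than BTRFS_STRIPE_LEN are not supported"
#endif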
@@ -56,6 +70,8 @@ struct scrub_page {
 	u64 generation;
 	u64 logical;
 	u64 physical;
+	u64 physical_for_dev_replace;
+	atomic_t ref_count;
 	struct {
 		unsigned int mirror_num:8;
 		unsigned int have_csum:1;
@@ -66,23 +82,28 @@ struct scrub_page {
 
 struct scrub_bio {
 	int index;
-	struct scrub_dev *sdev;
+	struct scrub_ctx *sctx;
+	struct btrfs_device *dev;
 	struct bio *bio;
 	int err;
 	u64 logical;
 	u64 physical;
-	struct scrub_page *pagev[SCRUB_PAGES_PER_BIO];
+#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
+	struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
+#else
+	struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
+#endif
 	int page_count;
 	int next_free;
 	struct btrfs_work work;
 };
 
 struct scrub_block {
-	struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK];
+	struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
 	int page_count;
 	atomic_t outstanding_pages;
 	atomic_t ref_count; /* free mem on transition to zero */
-	struct scrub_dev *sdev;
+	struct scrub_ctx *sctx;
 	struct {
 		unsigned int header_error:1;
 		unsigned int checksum_error:1;
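Two structural changes above carry much of the series: scrub_block (and scrub_bio) now hold pointers to dynamically allocated scrub_page structures, sized for whichever of the read and write bio limits is larger, and each page gains its own ref_count so one page can be shared between a block and an in-flight write bio. A self-contained userspace model of that get/put discipline, offered as a sketch only (names are invented; the kernel uses atomic_dec_and_test and its own allocators):

#include <stdatomic.h>
#include <stdlib.h>

struct sketch_page {
	atomic_int ref_count;	/* free mem on transition to zero */
	void *data;		/* stands in for the backing struct page */
};

static void sketch_page_get(struct sketch_page *p)
{
	atomic_fetch_add(&p->ref_count, 1);
}

static void sketch_page_put(struct sketch_page *p)
{
	/* atomic_fetch_sub returns the old value: 1 means last reference */
	if (atomic_fetch_sub(&p->ref_count, 1) == 1) {
		free(p->data);
		free(p);
	}
}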
@@ -91,23 +112,35 @@ struct scrub_block {
 	};
 };
 
-struct scrub_dev {
-	struct scrub_bio *bios[SCRUB_BIOS_PER_DEV];
-	struct btrfs_device *dev;
+struct scrub_wr_ctx {
+	struct scrub_bio *wr_curr_bio;
+	struct btrfs_device *tgtdev;
+	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
+	atomic_t flush_all_writes;
+	struct mutex wr_lock;
+};
+
+struct scrub_ctx {
+	struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
+	struct btrfs_root *dev_root;
 	int first_free;
 	int curr;
-	atomic_t in_flight;
-	atomic_t fixup_cnt;
+	atomic_t bios_in_flight;
+	atomic_t workers_pending;
 	spinlock_t list_lock;
 	wait_queue_head_t list_wait;
 	u16 csum_size;
 	struct list_head csum_list;
 	atomic_t cancel_req;
 	int readonly;
-	int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
+	int pages_per_rd_bio;
 	u32 sectorsize;
 	u32 nodesize;
 	u32 leafsize;
+
+	int is_dev_replace;
+	struct scrub_wr_ctx wr_ctx;
+
 	/*
 	 * statistics
 	 */
@@ -116,13 +149,23 @@ struct scrub_dev {
 };
 
 struct scrub_fixup_nodatasum {
-	struct scrub_dev *sdev;
+	struct scrub_ctx *sctx;
+	struct btrfs_device *dev;
 	u64 logical;
 	struct btrfs_root *root;
 	struct btrfs_work work;
 	int mirror_num;
 };
 
+struct scrub_copy_nocow_ctx {
+	struct scrub_ctx *sctx;
+	u64 logical;
+	u64 len;
+	int mirror_num;
+	u64 physical_for_dev_replace;
+	struct btrfs_work work;
+};
+
 struct scrub_warning {
 	struct btrfs_path *path;
 	u64 extent_item_size;
@@ -137,15 +180,20 @@ struct scrub_warning {
 };
 
 
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
-static int scrub_setup_recheck_block(struct scrub_dev *sdev,
-				     struct btrfs_mapping_tree *map_tree,
+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
+				     struct btrfs_fs_info *fs_info,
+				     struct scrub_block *original_sblock,
 				     u64 length, u64 logical,
-				     struct scrub_block *sblock);
-static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
-			       struct scrub_block *sblock, int is_metadata,
-			       int have_csum, u8 *csum, u64 generation,
-			       u16 csum_size);
+				     struct scrub_block *sblocks_for_recheck);
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+				struct scrub_block *sblock, int is_metadata,
+				int have_csum, u8 *csum, u64 generation,
+				u16 csum_size);
 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 					 struct scrub_block *sblock,
 					 int is_metadata, int have_csum,
@@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 					    struct scrub_block *sblock_good,
 					    int page_num, int force_write);
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+					   int page_num);
 static int scrub_checksum_data(struct scrub_block *sblock);
 static int scrub_checksum_tree_block(struct scrub_block *sblock);
 static int scrub_checksum_super(struct scrub_block *sblock);
 static void scrub_block_get(struct scrub_block *sblock);
 static void scrub_block_put(struct scrub_block *sblock);
-static int scrub_add_page_to_bio(struct scrub_dev *sdev,
-				 struct scrub_page *spage);
-static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
-		       u64 physical, u64 flags, u64 gen, int mirror_num,
-		       u8 *csum, int force);
+static void scrub_page_get(struct scrub_page *spage);
+static void scrub_page_put(struct scrub_page *spage);
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage);
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+		       u64 physical, struct btrfs_device *dev, u64 flags,
		       u64 gen, int mirror_num, u8 *csum, int force,
+		       u64 physical_for_dev_replace);
 static void scrub_bio_end_io(struct bio *bio, int err);
 static void scrub_bio_end_io_worker(struct btrfs_work *work);
 static void scrub_block_complete(struct scrub_block *sblock);
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+			       u64 extent_logical, u64 extent_len,
+			       u64 *extent_physical,
+			       struct btrfs_device **extent_dev,
+			       int *extent_mirror_num);
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+			      struct scrub_wr_ctx *wr_ctx,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_device *dev,
+			      int is_dev_replace);
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage);
+static void scrub_wr_submit(struct scrub_ctx *sctx);
+static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
+static int write_page_nocow(struct scrub_ctx *sctx,
+			    u64 physical_for_dev_replace, struct page *page);
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+				      void *ctx);
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+			    int mirror_num, u64 physical_for_dev_replace);
+static void copy_nocow_pages_worker(struct btrfs_work *work);
+
+
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
+{
+	atomic_inc(&sctx->bios_in_flight);
+}
+
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
+{
+	atomic_dec(&sctx->bios_in_flight);
+	wake_up(&sctx->list_wait);
+}
+
+/*
+ * used for workers that require transaction commits (i.e., for the
+ * NOCOW case)
+ */
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
+{
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+	/*
+	 * increment scrubs_running to prevent cancel requests from
+	 * completing as long as a worker is running. we must also
+	 * increment scrubs_paused to prevent deadlocking on pause
+	 * requests used for transactions commits (as the worker uses a
+	 * transaction context). it is safe to regard the worker
+	 * as paused for all matters practical. effectively, we only
+	 * avoid cancellation requests from completing.
+	 */
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_inc(&fs_info->scrubs_running);
+	atomic_inc(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+	atomic_inc(&sctx->workers_pending);
+}
 
+/* used for workers that require transaction commits */
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
+{
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 
-static void scrub_free_csums(struct scrub_dev *sdev)
+	/*
+	 * see scrub_pending_trans_workers_inc() why we're pretending
+	 * to be paused in the scrub counters
+	 */
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_dec(&fs_info->scrubs_running);
+	atomic_dec(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+	atomic_dec(&sctx->workers_pending);
+	wake_up(&fs_info->scrub_pause_wait);
+	wake_up(&sctx->list_wait);
+}
+
+static void scrub_free_csums(struct scrub_ctx *sctx)
 {
-	while (!list_empty(&sdev->csum_list)) {
+	while (!list_empty(&sctx->csum_list)) {
 		struct btrfs_ordered_sum *sum;
-		sum = list_first_entry(&sdev->csum_list,
+		sum = list_first_entry(&sctx->csum_list,
 				       struct btrfs_ordered_sum, list);
 		list_del(&sum->list);
 		kfree(sum);
 	}
 }
 
-static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
+static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 {
 	int i;
 
-	if (!sdev)
+	if (!sctx)
 		return;
 
+	scrub_free_wr_ctx(&sctx->wr_ctx);
+
 	/* this can happen when scrub is cancelled */
-	if (sdev->curr != -1) {
-		struct scrub_bio *sbio = sdev->bios[sdev->curr];
+	if (sctx->curr != -1) {
+		struct scrub_bio *sbio = sctx->bios[sctx->curr];
 
 		for (i = 0; i < sbio->page_count; i++) {
-			BUG_ON(!sbio->pagev[i]);
-			BUG_ON(!sbio->pagev[i]->page);
+			WARN_ON(!sbio->pagev[i]->page);
 			scrub_block_put(sbio->pagev[i]->sblock);
 		}
 		bio_put(sbio->bio);
 	}
 
-	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
-		struct scrub_bio *sbio = sdev->bios[i];
+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
+		struct scrub_bio *sbio = sctx->bios[i];
 
 		if (!sbio)
 			break;
 		kfree(sbio);
 	}
 
-	scrub_free_csums(sdev);
-	kfree(sdev);
+	scrub_free_csums(sctx);
+	kfree(sctx);
 }
 
 static noinline_for_stack
-struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
+struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 {
-	struct scrub_dev *sdev;
+	struct scrub_ctx *sctx;
 	int i;
 	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
-	int pages_per_bio;
+	int pages_per_rd_bio;
+	int ret;
 
-	pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
-			      bio_get_nr_vecs(dev->bdev));
-	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
-	if (!sdev)
+	/*
+	 * the setting of pages_per_rd_bio is correct for scrub but might
+	 * be wrong for the dev_replace code where we might read from
+	 * different devices in the initial huge bios. However, that
+	 * code is able to correctly handle the case when adding a page
+	 * to a bio fails.
+	 */
+	if (dev->bdev)
+		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
+					 bio_get_nr_vecs(dev->bdev));
+	else
+		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
+	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
+	if (!sctx)
 		goto nomem;
-	sdev->dev = dev;
-	sdev->pages_per_bio = pages_per_bio;
-	sdev->curr = -1;
-	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
+	sctx->is_dev_replace = is_dev_replace;
+	sctx->pages_per_rd_bio = pages_per_rd_bio;
+	sctx->curr = -1;
+	sctx->dev_root = dev->dev_root;
+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 		struct scrub_bio *sbio;
 
 		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
 		if (!sbio)
 			goto nomem;
-		sdev->bios[i] = sbio;
+		sctx->bios[i] = sbio;
 
 		sbio->index = i;
-		sbio->sdev = sdev;
+		sbio->sctx = sctx;
 		sbio->page_count = 0;
 		sbio->work.func = scrub_bio_end_io_worker;
 
-		if (i != SCRUB_BIOS_PER_DEV-1)
-			sdev->bios[i]->next_free = i + 1;
+		if (i != SCRUB_BIOS_PER_SCTX - 1)
+			sctx->bios[i]->next_free = i + 1;
 		else
-			sdev->bios[i]->next_free = -1;
+			sctx->bios[i]->next_free = -1;
 	}
-	sdev->first_free = 0;
-	sdev->nodesize = dev->dev_root->nodesize;
-	sdev->leafsize = dev->dev_root->leafsize;
-	sdev->sectorsize = dev->dev_root->sectorsize;
-	atomic_set(&sdev->in_flight, 0);
-	atomic_set(&sdev->fixup_cnt, 0);
-	atomic_set(&sdev->cancel_req, 0);
-	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
-	INIT_LIST_HEAD(&sdev->csum_list);
+	sctx->first_free = 0;
+	sctx->nodesize = dev->dev_root->nodesize;
+	sctx->leafsize = dev->dev_root->leafsize;
+	sctx->sectorsize = dev->dev_root->sectorsize;
+	atomic_set(&sctx->bios_in_flight, 0);
+	atomic_set(&sctx->workers_pending, 0);
+	atomic_set(&sctx->cancel_req, 0);
+	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
+	INIT_LIST_HEAD(&sctx->csum_list);
 
-	spin_lock_init(&sdev->list_lock);
-	spin_lock_init(&sdev->stat_lock);
-	init_waitqueue_head(&sdev->list_wait);
-	return sdev;
+	spin_lock_init(&sctx->list_lock);
+	spin_lock_init(&sctx->stat_lock);
+	init_waitqueue_head(&sctx->list_wait);
+
+	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
+				 fs_info->dev_replace.tgtdev, is_dev_replace);
+	if (ret) {
+		scrub_free_ctx(sctx);
+		return ERR_PTR(ret);
+	}
+	return sctx;
 
 nomem:
-	scrub_free_dev(sdev);
+	scrub_free_ctx(sctx);
 	return ERR_PTR(-ENOMEM);
 }
 
-static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
+				     void *warn_ctx)
 {
 	u64 isize;
 	u32 nlink;
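The scrub_pending_trans_workers_inc()/_dec() pair added above encodes the invariant spelled out in its comment: a worker that may commit a transaction is counted both as running, so a cancel request cannot complete underneath it, and as paused, so a pause request issued for a transaction commit does not wait on the worker and deadlock. A standalone userspace model of that counting scheme follows; it is a sketch with invented names, not kernel code:

#include <stdatomic.h>
#include <stdbool.h>

struct sketch_counters {
	atomic_int scrubs_running;	/* > 0 blocks cancellation */
	atomic_int scrubs_paused;	/* lets pause requests proceed */
};

/*
 * A transaction-committing worker counts as running AND paused:
 * running, so a cancel request cannot complete underneath it;
 * paused, so a pause request issued for a transaction commit does
 * not wait for the worker (which itself needs the transaction).
 */
static void sketch_trans_worker_inc(struct sketch_counters *c)
{
	atomic_fetch_add(&c->scrubs_running, 1);
	atomic_fetch_add(&c->scrubs_paused, 1);
}

static void sketch_trans_worker_dec(struct sketch_counters *c)
{
	atomic_fetch_sub(&c->scrubs_running, 1);
	atomic_fetch_sub(&c->scrubs_paused, 1);
}

/* the pause path may proceed once every running scrub counts as paused */
static bool sketch_fully_paused(struct sketch_counters *c)
{
	return atomic_load(&c->scrubs_paused) ==
	       atomic_load(&c->scrubs_running);
}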
@@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
 	int i;
 	struct extent_buffer *eb;
 	struct btrfs_inode_item *inode_item;
-	struct scrub_warning *swarn = ctx;
+	struct scrub_warning *swarn = warn_ctx;
 	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
 	struct inode_fs_paths *ipath = NULL;
 	struct btrfs_root *local_root;
@@ -345,8 +496,8 @@ err:
 
 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 {
-	struct btrfs_device *dev = sblock->sdev->dev;
-	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+	struct btrfs_device *dev;
+	struct btrfs_fs_info *fs_info;
 	struct btrfs_path *path;
 	struct btrfs_key found_key;
 	struct extent_buffer *eb;
@@ -361,15 +512,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 	const int bufsize = 4096;
 	int ret;
 
+	WARN_ON(sblock->page_count < 1);
+	dev = sblock->pagev[0]->dev;
+	fs_info = sblock->sctx->dev_root->fs_info;
+
 	path = btrfs_alloc_path();
 
 	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
 	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
-	BUG_ON(sblock->page_count < 1);
-	swarn.sector = (sblock->pagev[0].physical) >> 9;
-	swarn.logical = sblock->pagev[0].logical;
+	swarn.sector = (sblock->pagev[0]->physical) >> 9;
+	swarn.logical = sblock->pagev[0]->logical;
 	swarn.errstr = errstr;
-	swarn.dev = dev;
+	swarn.dev = NULL;
 	swarn.msg_bufsize = bufsize;
 	swarn.scratch_bufsize = bufsize;
 
@@ -405,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 		} while (ret != 1);
 	} else {
 		swarn.path = path;
+		swarn.dev = dev;
 		iterate_extent_inodes(fs_info, found_key.objectid,
 					extent_item_pos, 1,
 					scrub_print_warning_inode, &swarn);
@@ -416,11 +571,11 @@ out:
 	kfree(swarn.msg_buf);
 }
 
-static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 {
 	struct page *page = NULL;
 	unsigned long index;
-	struct scrub_fixup_nodatasum *fixup = ctx;
+	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
 	int ret;
 	int corrected = 0;
 	struct btrfs_key key;
@@ -451,7 +606,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
 	}
 
 	if (PageUptodate(page)) {
-		struct btrfs_mapping_tree *map_tree;
+		struct btrfs_fs_info *fs_info;
 		if (PageDirty(page)) {
 			/*
 			 * we need to write the data to the defect sector. the
@@ -472,8 +627,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
 			ret = -EIO;
 			goto out;
 		}
-		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
-		ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
+		fs_info = BTRFS_I(inode)->root->fs_info;
+		ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
 					fixup->logical, page,
 					fixup->mirror_num);
 		unlock_page(page);
@@ -530,21 +685,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
 {
 	int ret;
 	struct scrub_fixup_nodatasum *fixup;
-	struct scrub_dev *sdev;
+	struct scrub_ctx *sctx;
 	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_path *path;
 	int uncorrectable = 0;
 
 	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
-	sdev = fixup->sdev;
+	sctx = fixup->sctx;
 	fs_info = fixup->root->fs_info;
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		spin_lock(&sdev->stat_lock);
-		++sdev->stat.malloc_errors;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.malloc_errors;
+		spin_unlock(&sctx->stat_lock);
 		uncorrectable = 1;
 		goto out;
 	}
@@ -573,35 +728,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
 	}
 	WARN_ON(ret != 1);
 
-	spin_lock(&sdev->stat_lock);
-	++sdev->stat.corrected_errors;
-	spin_unlock(&sdev->stat_lock);
+	spin_lock(&sctx->stat_lock);
+	++sctx->stat.corrected_errors;
+	spin_unlock(&sctx->stat_lock);
 
 out:
 	if (trans && !IS_ERR(trans))
 		btrfs_end_transaction(trans, fixup->root);
 	if (uncorrectable) {
-		spin_lock(&sdev->stat_lock);
-		++sdev->stat.uncorrectable_errors;
-		spin_unlock(&sdev->stat_lock);
-
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.uncorrectable_errors;
+		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_replace_stats_inc(
+			&sctx->dev_root->fs_info->dev_replace.
+			num_uncorrectable_read_errors);
 		printk_ratelimited_in_rcu(KERN_ERR
 			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
 			(unsigned long long)fixup->logical,
-			rcu_str_deref(sdev->dev->name));
+			rcu_str_deref(fixup->dev->name));
 	}
 
 	btrfs_free_path(path);
 	kfree(fixup);
 
-	/* see caller why we're pretending to be paused in the scrub counters */
-	mutex_lock(&fs_info->scrub_lock);
-	atomic_dec(&fs_info->scrubs_running);
-	atomic_dec(&fs_info->scrubs_paused);
-	mutex_unlock(&fs_info->scrub_lock);
-	atomic_dec(&sdev->fixup_cnt);
-	wake_up(&fs_info->scrub_pause_wait);
-	wake_up(&sdev->list_wait);
+	scrub_pending_trans_workers_dec(sctx);
 }
 
 /*
@@ -614,7 +764,8 @@ out:
  */
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 {
-	struct scrub_dev *sdev = sblock_to_check->sdev;
+	struct scrub_ctx *sctx = sblock_to_check->sctx;
+	struct btrfs_device *dev;
 	struct btrfs_fs_info *fs_info;
 	u64 length;
 	u64 logical;
@@ -633,16 +784,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 				      DEFAULT_RATELIMIT_BURST);
 
 	BUG_ON(sblock_to_check->page_count < 1);
-	fs_info = sdev->dev->dev_root->fs_info;
+	fs_info = sctx->dev_root->fs_info;
+	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+		/*
+		 * if we find an error in a super block, we just report it.
+		 * They will get written with the next transaction commit
+		 * anyway
+		 */
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.super_errors;
+		spin_unlock(&sctx->stat_lock);
+		return 0;
+	}
 	length = sblock_to_check->page_count * PAGE_SIZE;
-	logical = sblock_to_check->pagev[0].logical;
-	generation = sblock_to_check->pagev[0].generation;
-	BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
-	failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
-	is_metadata = !(sblock_to_check->pagev[0].flags &
+	logical = sblock_to_check->pagev[0]->logical;
+	generation = sblock_to_check->pagev[0]->generation;
+	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
+	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
+	is_metadata = !(sblock_to_check->pagev[0]->flags &
 			BTRFS_EXTENT_FLAG_DATA);
-	have_csum = sblock_to_check->pagev[0].have_csum;
-	csum = sblock_to_check->pagev[0].csum;
+	have_csum = sblock_to_check->pagev[0]->have_csum;
+	csum = sblock_to_check->pagev[0]->csum;
+	dev = sblock_to_check->pagev[0]->dev;
+
+	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
+		sblocks_for_recheck = NULL;
+		goto nodatasum_case;
+	}
 
 	/*
 	 * read all mirrors one after the other. This includes to
@@ -677,43 +845,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 				     sizeof(*sblocks_for_recheck),
 				     GFP_NOFS);
 	if (!sblocks_for_recheck) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.malloc_errors++;
-		sdev->stat.read_errors++;
-		sdev->stat.uncorrectable_errors++;
-		spin_unlock(&sdev->stat_lock);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
-					     BTRFS_DEV_STAT_READ_ERRS);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		sctx->stat.read_errors++;
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 		goto out;
 	}
 
 	/* setup the context, map the logical blocks and alloc the pages */
-	ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length,
+	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
 					logical, sblocks_for_recheck);
 	if (ret) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.read_errors++;
-		sdev->stat.uncorrectable_errors++;
-		spin_unlock(&sdev->stat_lock);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
-					     BTRFS_DEV_STAT_READ_ERRS);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.read_errors++;
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 		goto out;
 	}
 	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
 	sblock_bad = sblocks_for_recheck + failed_mirror_index;
 
 	/* build and submit the bios for the failed mirror, check checksums */
-	ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
-				  csum, generation, sdev->csum_size);
-	if (ret) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.read_errors++;
-		sdev->stat.uncorrectable_errors++;
-		spin_unlock(&sdev->stat_lock);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
-					     BTRFS_DEV_STAT_READ_ERRS);
-		goto out;
-	}
+	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
+			    csum, generation, sctx->csum_size);
 
 	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
 	    sblock_bad->no_io_error_seen) {
@@ -725,50 +882,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		 * different bio (usually one of the two latter cases is
 		 * the cause)
 		 */
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.unverified_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.unverified_errors++;
+		spin_unlock(&sctx->stat_lock);
 
+		if (sctx->is_dev_replace)
+			scrub_write_block_to_dev_replace(sblock_bad);
 		goto out;
 	}
 
 	if (!sblock_bad->no_io_error_seen) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.read_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.read_errors++;
+		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("i/o error", sblock_to_check);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
-					     BTRFS_DEV_STAT_READ_ERRS);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 	} else if (sblock_bad->checksum_error) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.csum_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.csum_errors++;
+		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("checksum error", sblock_to_check);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
+		btrfs_dev_stat_inc_and_print(dev,
 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	} else if (sblock_bad->header_error) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.verify_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.verify_errors++;
+		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("checksum/header error",
 					    sblock_to_check);
 		if (sblock_bad->generation_error)
-			btrfs_dev_stat_inc_and_print(sdev->dev,
+			btrfs_dev_stat_inc_and_print(dev,
 				BTRFS_DEV_STAT_GENERATION_ERRS);
 		else
-			btrfs_dev_stat_inc_and_print(sdev->dev,
+			btrfs_dev_stat_inc_and_print(dev,
 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	}
 
-	if (sdev->readonly)
+	if (sctx->readonly && !sctx->is_dev_replace)
 		goto did_not_correct_error;
 
 	if (!is_metadata && !have_csum) {
 		struct scrub_fixup_nodatasum *fixup_nodatasum;
 
+nodatasum_case:
+		WARN_ON(sctx->is_dev_replace);
+
 		/*
 		 * !is_metadata and !have_csum, this means that the data
 		 * might not be COW'ed, that it might be modified
@@ -779,24 +940,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
 		if (!fixup_nodatasum)
 			goto did_not_correct_error;
-		fixup_nodatasum->sdev = sdev;
+		fixup_nodatasum->sctx = sctx;
+		fixup_nodatasum->dev = dev;
 		fixup_nodatasum->logical = logical;
 		fixup_nodatasum->root = fs_info->extent_root;
 		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
-		/*
-		 * increment scrubs_running to prevent cancel requests from
-		 * completing as long as a fixup worker is running. we must also
-		 * increment scrubs_paused to prevent deadlocking on pause
-		 * requests used for transactions commits (as the worker uses a
-		 * transaction context). it is safe to regard the fixup worker
-		 * as paused for all matters practical. effectively, we only
-		 * avoid cancellation requests from completing.
-		 */
-		mutex_lock(&fs_info->scrub_lock);
-		atomic_inc(&fs_info->scrubs_running);
-		atomic_inc(&fs_info->scrubs_paused);
-		mutex_unlock(&fs_info->scrub_lock);
-		atomic_inc(&sdev->fixup_cnt);
+		scrub_pending_trans_workers_inc(sctx);
 		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
 		btrfs_queue_worker(&fs_info->scrub_workers,
 				   &fixup_nodatasum->work);
@@ -805,26 +954,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
 	/*
 	 * now build and submit the bios for the other mirrors, check
-	 * checksums
-	 */
-	for (mirror_index = 0;
-	     mirror_index < BTRFS_MAX_MIRRORS &&
-	     sblocks_for_recheck[mirror_index].page_count > 0;
-	     mirror_index++) {
-		if (mirror_index == failed_mirror_index)
-			continue;
-
-		/* build and submit the bios, check checksums */
-		ret = scrub_recheck_block(fs_info,
-					  sblocks_for_recheck + mirror_index,
-					  is_metadata, have_csum, csum,
-					  generation, sdev->csum_size);
-		if (ret)
-			goto did_not_correct_error;
-	}
-
-	/*
-	 * first try to pick the mirror which is completely without I/O
+	 * checksums.
+	 * First try to pick the mirror which is completely without I/O
 	 * errors and also does not have a checksum error.
 	 * If one is found, and if a checksum is present, the full block
 	 * that is known to contain an error is rewritten. Afterwards
@@ -840,24 +971,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	     mirror_index < BTRFS_MAX_MIRRORS &&
 	     sblocks_for_recheck[mirror_index].page_count > 0;
 	     mirror_index++) {
-		struct scrub_block *sblock_other = sblocks_for_recheck +
-						   mirror_index;
+		struct scrub_block *sblock_other;
+
+		if (mirror_index == failed_mirror_index)
+			continue;
+		sblock_other = sblocks_for_recheck + mirror_index;
+
+		/* build and submit the bios, check checksums */
+		scrub_recheck_block(fs_info, sblock_other, is_metadata,
+				    have_csum, csum, generation,
+				    sctx->csum_size);
 
 		if (!sblock_other->header_error &&
 		    !sblock_other->checksum_error &&
 		    sblock_other->no_io_error_seen) {
-			int force_write = is_metadata || have_csum;
-
-			ret = scrub_repair_block_from_good_copy(sblock_bad,
-								sblock_other,
-								force_write);
+			if (sctx->is_dev_replace) {
+				scrub_write_block_to_dev_replace(sblock_other);
+			} else {
+				int force_write = is_metadata || have_csum;
+
+				ret = scrub_repair_block_from_good_copy(
+						sblock_bad, sblock_other,
+						force_write);
+			}
 			if (0 == ret)
 				goto corrected_error;
 		}
 	}
 
 	/*
-	 * in case of I/O errors in the area that is supposed to be
+	 * for dev_replace, pick good pages and write to the target device.
+	 */
+	if (sctx->is_dev_replace) {
+		success = 1;
+		for (page_num = 0; page_num < sblock_bad->page_count;
+		     page_num++) {
+			int sub_success;
+
+			sub_success = 0;
+			for (mirror_index = 0;
+			     mirror_index < BTRFS_MAX_MIRRORS &&
+			     sblocks_for_recheck[mirror_index].page_count > 0;
+			     mirror_index++) {
+				struct scrub_block *sblock_other =
+					sblocks_for_recheck + mirror_index;
+				struct scrub_page *page_other =
+					sblock_other->pagev[page_num];
+
+				if (!page_other->io_error) {
+					ret = scrub_write_page_to_dev_replace(
+							sblock_other, page_num);
+					if (ret == 0) {
+						/* succeeded for this page */
+						sub_success = 1;
+						break;
+					} else {
+						btrfs_dev_replace_stats_inc(
+							&sctx->dev_root->
+							fs_info->dev_replace.
+							num_write_errors);
+					}
+				}
+			}
+
+			if (!sub_success) {
+				/*
+				 * did not find a mirror to fetch the page
+				 * from. scrub_write_page_to_dev_replace()
+				 * handles this case (page->io_error), by
+				 * filling the block with zeros before
+				 * submitting the write request
+				 */
+				success = 0;
+				ret = scrub_write_page_to_dev_replace(
+						sblock_bad, page_num);
+				if (ret)
+					btrfs_dev_replace_stats_inc(
+						&sctx->dev_root->fs_info->
+						dev_replace.num_write_errors);
+			}
+		}
+
+		goto out;
+	}
+
+	/*
+	 * for regular scrub, repair those pages that are errored.
+	 * In case of I/O errors in the area that is supposed to be
 	 * repaired, continue by picking good copies of those pages.
 	 * Select the good pages from mirrors to rewrite bad pages from
 	 * the area to fix. Afterwards verify the checksum of the block
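The dev_replace branch added above works page by page: it copies each page from the first mirror whose read of that page succeeded, and only when every mirror failed does it fall back to the zero-filled bad page so the target device still receives a write. The selection policy reduces to a few lines; the following is an illustrative sketch with invented names, not kernel code:

/*
 * Returns the index of the first mirror whose copy of the page read
 * without an I/O error, or -1 when no clean copy exists and the
 * caller must fall back to writing the zero-filled bad page.
 */
static int sketch_pick_source_mirror(const int page_io_error[], int nr_mirrors)
{
	int m;

	for (m = 0; m < nr_mirrors; m++)
		if (!page_io_error[m])
			return m;
	return -1;
}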
@@ -887,7 +1087,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
887 1087
888 success = 1; 1088 success = 1;
889 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { 1089 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
890 struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1090 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
891 1091
892 if (!page_bad->io_error) 1092 if (!page_bad->io_error)
893 continue; 1093 continue;
@@ -898,8 +1098,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
898 mirror_index++) { 1098 mirror_index++) {
899 struct scrub_block *sblock_other = sblocks_for_recheck + 1099 struct scrub_block *sblock_other = sblocks_for_recheck +
900 mirror_index; 1100 mirror_index;
901 struct scrub_page *page_other = sblock_other->pagev + 1101 struct scrub_page *page_other = sblock_other->pagev[
902 page_num; 1102 page_num];
903 1103
904 if (!page_other->io_error) { 1104 if (!page_other->io_error) {
905 ret = scrub_repair_page_from_good_copy( 1105 ret = scrub_repair_page_from_good_copy(
@@ -928,10 +1128,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
928 * is verified, but most likely the data comes out 1128 * is verified, but most likely the data comes out
929 * of the page cache. 1129 * of the page cache.
930 */ 1130 */
931 ret = scrub_recheck_block(fs_info, sblock_bad, 1131 scrub_recheck_block(fs_info, sblock_bad,
932 is_metadata, have_csum, csum, 1132 is_metadata, have_csum, csum,
933 generation, sdev->csum_size); 1133 generation, sctx->csum_size);
934 if (!ret && !sblock_bad->header_error && 1134 if (!sblock_bad->header_error &&
935 !sblock_bad->checksum_error && 1135 !sblock_bad->checksum_error &&
936 sblock_bad->no_io_error_seen) 1136 sblock_bad->no_io_error_seen)
937 goto corrected_error; 1137 goto corrected_error;
@@ -939,23 +1139,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
939 goto did_not_correct_error; 1139 goto did_not_correct_error;
940 } else { 1140 } else {
941corrected_error: 1141corrected_error:
942 spin_lock(&sdev->stat_lock); 1142 spin_lock(&sctx->stat_lock);
943 sdev->stat.corrected_errors++; 1143 sctx->stat.corrected_errors++;
944 spin_unlock(&sdev->stat_lock); 1144 spin_unlock(&sctx->stat_lock);
945 printk_ratelimited_in_rcu(KERN_ERR 1145 printk_ratelimited_in_rcu(KERN_ERR
946 "btrfs: fixed up error at logical %llu on dev %s\n", 1146 "btrfs: fixed up error at logical %llu on dev %s\n",
947 (unsigned long long)logical, 1147 (unsigned long long)logical,
948 rcu_str_deref(sdev->dev->name)); 1148 rcu_str_deref(dev->name));
949 } 1149 }
950 } else { 1150 } else {
951did_not_correct_error: 1151did_not_correct_error:
952 spin_lock(&sdev->stat_lock); 1152 spin_lock(&sctx->stat_lock);
953 sdev->stat.uncorrectable_errors++; 1153 sctx->stat.uncorrectable_errors++;
954 spin_unlock(&sdev->stat_lock); 1154 spin_unlock(&sctx->stat_lock);
955 printk_ratelimited_in_rcu(KERN_ERR 1155 printk_ratelimited_in_rcu(KERN_ERR
956 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", 1156 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
957 (unsigned long long)logical, 1157 (unsigned long long)logical,
958 rcu_str_deref(sdev->dev->name)); 1158 rcu_str_deref(dev->name));
959 } 1159 }
960 1160
961out: 1161out:
@@ -966,11 +1166,11 @@ out:
966 mirror_index; 1166 mirror_index;
967 int page_index; 1167 int page_index;
968 1168
969 for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; 1169 for (page_index = 0; page_index < sblock->page_count;
970 page_index++) 1170 page_index++) {
971 if (sblock->pagev[page_index].page) 1171 sblock->pagev[page_index]->sblock = NULL;
972 __free_page( 1172 scrub_page_put(sblock->pagev[page_index]);
973 sblock->pagev[page_index].page); 1173 }
974 } 1174 }
975 kfree(sblocks_for_recheck); 1175 kfree(sblocks_for_recheck);
976 } 1176 }
@@ -978,8 +1178,9 @@ out:
978 return 0; 1178 return 0;
979} 1179}
980 1180
981static int scrub_setup_recheck_block(struct scrub_dev *sdev, 1181static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
982 struct btrfs_mapping_tree *map_tree, 1182 struct btrfs_fs_info *fs_info,
1183 struct scrub_block *original_sblock,
983 u64 length, u64 logical, 1184 u64 length, u64 logical,
984 struct scrub_block *sblocks_for_recheck) 1185 struct scrub_block *sblocks_for_recheck)
985{ 1186{
@@ -988,7 +1189,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
988 int ret; 1189 int ret;
989 1190
990 /* 1191 /*
991 * note: the three members sdev, ref_count and outstanding_pages 1192 * note: the two members ref_count and outstanding_pages
992 * are not used (and not set) in the blocks that are used for 1193 * are not used (and not set) in the blocks that are used for
993 * the recheck procedure 1194 * the recheck procedure
994 */ 1195 */
@@ -1003,14 +1204,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1003 * with a length of PAGE_SIZE, each returned stripe 1204 * with a length of PAGE_SIZE, each returned stripe
1004 * represents one mirror 1205 * represents one mirror
1005 */ 1206 */
1006 ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, 1207 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
1007 &bbio, 0); 1208 &mapped_length, &bbio, 0);
1008 if (ret || !bbio || mapped_length < sublen) { 1209 if (ret || !bbio || mapped_length < sublen) {
1009 kfree(bbio); 1210 kfree(bbio);
1010 return -EIO; 1211 return -EIO;
1011 } 1212 }
1012 1213
1013 BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); 1214 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1014 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; 1215 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1015 mirror_index++) { 1216 mirror_index++) {
1016 struct scrub_block *sblock; 1217 struct scrub_block *sblock;
@@ -1020,21 +1221,31 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1020 continue; 1221 continue;
1021 1222
1022 sblock = sblocks_for_recheck + mirror_index; 1223 sblock = sblocks_for_recheck + mirror_index;
1023 page = sblock->pagev + page_index; 1224 sblock->sctx = sctx;
1225 page = kzalloc(sizeof(*page), GFP_NOFS);
1226 if (!page) {
1227leave_nomem:
1228 spin_lock(&sctx->stat_lock);
1229 sctx->stat.malloc_errors++;
1230 spin_unlock(&sctx->stat_lock);
1231 kfree(bbio);
1232 return -ENOMEM;
1233 }
1234 scrub_page_get(page);
1235 sblock->pagev[page_index] = page;
1024 page->logical = logical; 1236 page->logical = logical;
1025 page->physical = bbio->stripes[mirror_index].physical; 1237 page->physical = bbio->stripes[mirror_index].physical;
1238 BUG_ON(page_index >= original_sblock->page_count);
1239 page->physical_for_dev_replace =
1240 original_sblock->pagev[page_index]->
1241 physical_for_dev_replace;
1026 /* for missing devices, dev->bdev is NULL */ 1242 /* for missing devices, dev->bdev is NULL */
1027 page->dev = bbio->stripes[mirror_index].dev; 1243 page->dev = bbio->stripes[mirror_index].dev;
1028 page->mirror_num = mirror_index + 1; 1244 page->mirror_num = mirror_index + 1;
1029 page->page = alloc_page(GFP_NOFS);
1030 if (!page->page) {
1031 spin_lock(&sdev->stat_lock);
1032 sdev->stat.malloc_errors++;
1033 spin_unlock(&sdev->stat_lock);
1034 kfree(bbio);
1035 return -ENOMEM;
1036 }
1037 sblock->page_count++; 1245 sblock->page_count++;
1246 page->page = alloc_page(GFP_NOFS);
1247 if (!page->page)
1248 goto leave_nomem;
1038 } 1249 }
1039 kfree(bbio); 1250 kfree(bbio);
1040 length -= sublen; 1251 length -= sublen;
@@ -1052,10 +1263,10 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
1052 * to take those pages that are not errored from all the mirrors so that 1263 * to take those pages that are not errored from all the mirrors so that
1053 * the pages that are errored in the just handled mirror can be repaired. 1264 * the pages that are errored in the just handled mirror can be repaired.
1054 */ 1265 */
1055static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 1266static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1056 struct scrub_block *sblock, int is_metadata, 1267 struct scrub_block *sblock, int is_metadata,
1057 int have_csum, u8 *csum, u64 generation, 1268 int have_csum, u8 *csum, u64 generation,
1058 u16 csum_size) 1269 u16 csum_size)
1059{ 1270{
1060 int page_num; 1271 int page_num;
1061 1272
@@ -1065,8 +1276,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1065 1276
1066 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1277 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1067 struct bio *bio; 1278 struct bio *bio;
1068 int ret; 1279 struct scrub_page *page = sblock->pagev[page_num];
1069 struct scrub_page *page = sblock->pagev + page_num;
1070 DECLARE_COMPLETION_ONSTACK(complete); 1280 DECLARE_COMPLETION_ONSTACK(complete);
1071 1281
1072 if (page->dev->bdev == NULL) { 1282 if (page->dev->bdev == NULL) {
@@ -1075,20 +1285,19 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1075 continue; 1285 continue;
1076 } 1286 }
1077 1287
1078 BUG_ON(!page->page); 1288 WARN_ON(!page->page);
1079 bio = bio_alloc(GFP_NOFS, 1); 1289 bio = bio_alloc(GFP_NOFS, 1);
1080 if (!bio) 1290 if (!bio) {
1081 return -EIO; 1291 page->io_error = 1;
1292 sblock->no_io_error_seen = 0;
1293 continue;
1294 }
1082 bio->bi_bdev = page->dev->bdev; 1295 bio->bi_bdev = page->dev->bdev;
1083 bio->bi_sector = page->physical >> 9; 1296 bio->bi_sector = page->physical >> 9;
1084 bio->bi_end_io = scrub_complete_bio_end_io; 1297 bio->bi_end_io = scrub_complete_bio_end_io;
1085 bio->bi_private = &complete; 1298 bio->bi_private = &complete;
1086 1299
1087 ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); 1300 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1088 if (PAGE_SIZE != ret) {
1089 bio_put(bio);
1090 return -EIO;
1091 }
1092 btrfsic_submit_bio(READ, bio); 1301 btrfsic_submit_bio(READ, bio);
1093 1302
1094 /* this will also unplug the queue */ 1303 /* this will also unplug the queue */
@@ -1105,7 +1314,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
1105 have_csum, csum, generation, 1314 have_csum, csum, generation,
1106 csum_size); 1315 csum_size);
1107 1316
1108 return 0; 1317 return;
1109} 1318}
1110 1319
1111static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 1320static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
@@ -1120,14 +1329,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1120 struct btrfs_root *root = fs_info->extent_root; 1329 struct btrfs_root *root = fs_info->extent_root;
1121 void *mapped_buffer; 1330 void *mapped_buffer;
1122 1331
1123 BUG_ON(!sblock->pagev[0].page); 1332 WARN_ON(!sblock->pagev[0]->page);
1124 if (is_metadata) { 1333 if (is_metadata) {
1125 struct btrfs_header *h; 1334 struct btrfs_header *h;
1126 1335
1127 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1336 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1128 h = (struct btrfs_header *)mapped_buffer; 1337 h = (struct btrfs_header *)mapped_buffer;
1129 1338
1130 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || 1339 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
1131 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1340 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1132 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1341 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1133 BTRFS_UUID_SIZE)) { 1342 BTRFS_UUID_SIZE)) {
@@ -1141,7 +1350,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1141 if (!have_csum) 1350 if (!have_csum)
1142 return; 1351 return;
1143 1352
1144 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1353 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1145 } 1354 }
1146 1355
1147 for (page_num = 0;;) { 1356 for (page_num = 0;;) {
@@ -1157,9 +1366,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1157 page_num++; 1366 page_num++;
1158 if (page_num >= sblock->page_count) 1367 if (page_num >= sblock->page_count)
1159 break; 1368 break;
1160 BUG_ON(!sblock->pagev[page_num].page); 1369 WARN_ON(!sblock->pagev[page_num]->page);
1161 1370
1162 mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); 1371 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1163 } 1372 }
1164 1373
1165 btrfs_csum_final(crc, calculated_csum); 1374 btrfs_csum_final(crc, calculated_csum);
@@ -1197,17 +1406,23 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1197 struct scrub_block *sblock_good, 1406 struct scrub_block *sblock_good,
1198 int page_num, int force_write) 1407 int page_num, int force_write)
1199{ 1408{
1200 struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1409 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1201 struct scrub_page *page_good = sblock_good->pagev + page_num; 1410 struct scrub_page *page_good = sblock_good->pagev[page_num];
1202 1411
1203 BUG_ON(sblock_bad->pagev[page_num].page == NULL); 1412 BUG_ON(page_bad->page == NULL);
1204 BUG_ON(sblock_good->pagev[page_num].page == NULL); 1413 BUG_ON(page_good->page == NULL);
1205 if (force_write || sblock_bad->header_error || 1414 if (force_write || sblock_bad->header_error ||
1206 sblock_bad->checksum_error || page_bad->io_error) { 1415 sblock_bad->checksum_error || page_bad->io_error) {
1207 struct bio *bio; 1416 struct bio *bio;
1208 int ret; 1417 int ret;
1209 DECLARE_COMPLETION_ONSTACK(complete); 1418 DECLARE_COMPLETION_ONSTACK(complete);
1210 1419
1420 if (!page_bad->dev->bdev) {
1421 printk_ratelimited(KERN_WARNING
1422 "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
1423 return -EIO;
1424 }
1425
1211 bio = bio_alloc(GFP_NOFS, 1); 1426 bio = bio_alloc(GFP_NOFS, 1);
1212 if (!bio) 1427 if (!bio)
1213 return -EIO; 1428 return -EIO;
@@ -1228,6 +1443,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1228 if (!bio_flagged(bio, BIO_UPTODATE)) { 1443 if (!bio_flagged(bio, BIO_UPTODATE)) {
1229 btrfs_dev_stat_inc_and_print(page_bad->dev, 1444 btrfs_dev_stat_inc_and_print(page_bad->dev,
1230 BTRFS_DEV_STAT_WRITE_ERRS); 1445 BTRFS_DEV_STAT_WRITE_ERRS);
1446 btrfs_dev_replace_stats_inc(
1447 &sblock_bad->sctx->dev_root->fs_info->
1448 dev_replace.num_write_errors);
1231 bio_put(bio); 1449 bio_put(bio);
1232 return -EIO; 1450 return -EIO;
1233 } 1451 }
@@ -1237,13 +1455,174 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1237 return 0; 1455 return 0;
1238} 1456}
1239 1457
1240static void scrub_checksum(struct scrub_block *sblock) 1458static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1459{
1460 int page_num;
1461
1462 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1463 int ret;
1464
1465 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1466 if (ret)
1467 btrfs_dev_replace_stats_inc(
1468 &sblock->sctx->dev_root->fs_info->dev_replace.
1469 num_write_errors);
1470 }
1471}
1472
1473static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1474 int page_num)
1475{
1476 struct scrub_page *spage = sblock->pagev[page_num];
1477
1478 BUG_ON(spage->page == NULL);
1479 if (spage->io_error) {
1480 void *mapped_buffer = kmap_atomic(spage->page);
1481
1482 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1483 flush_dcache_page(spage->page);
1484 kunmap_atomic(mapped_buffer);
1485 }
1486 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1487}
1488
1489static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1490 struct scrub_page *spage)
1491{
1492 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1493 struct scrub_bio *sbio;
1494 int ret;
1495
1496 mutex_lock(&wr_ctx->wr_lock);
1497again:
1498 if (!wr_ctx->wr_curr_bio) {
1499 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1500 GFP_NOFS);
1501 if (!wr_ctx->wr_curr_bio) {
1502 mutex_unlock(&wr_ctx->wr_lock);
1503 return -ENOMEM;
1504 }
1505 wr_ctx->wr_curr_bio->sctx = sctx;
1506 wr_ctx->wr_curr_bio->page_count = 0;
1507 }
1508 sbio = wr_ctx->wr_curr_bio;
1509 if (sbio->page_count == 0) {
1510 struct bio *bio;
1511
1512 sbio->physical = spage->physical_for_dev_replace;
1513 sbio->logical = spage->logical;
1514 sbio->dev = wr_ctx->tgtdev;
1515 bio = sbio->bio;
1516 if (!bio) {
1517 bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1518 if (!bio) {
1519 mutex_unlock(&wr_ctx->wr_lock);
1520 return -ENOMEM;
1521 }
1522 sbio->bio = bio;
1523 }
1524
1525 bio->bi_private = sbio;
1526 bio->bi_end_io = scrub_wr_bio_end_io;
1527 bio->bi_bdev = sbio->dev->bdev;
1528 bio->bi_sector = sbio->physical >> 9;
1529 sbio->err = 0;
1530 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1531 spage->physical_for_dev_replace ||
1532 sbio->logical + sbio->page_count * PAGE_SIZE !=
1533 spage->logical) {
1534 scrub_wr_submit(sctx);
1535 goto again;
1536 }
1537
1538 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1539 if (ret != PAGE_SIZE) {
1540 if (sbio->page_count < 1) {
1541 bio_put(sbio->bio);
1542 sbio->bio = NULL;
1543 mutex_unlock(&wr_ctx->wr_lock);
1544 return -EIO;
1545 }
1546 scrub_wr_submit(sctx);
1547 goto again;
1548 }
1549
1550 sbio->pagev[sbio->page_count] = spage;
1551 scrub_page_get(spage);
1552 sbio->page_count++;
1553 if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1554 scrub_wr_submit(sctx);
1555 mutex_unlock(&wr_ctx->wr_lock);
1556
1557 return 0;
1558}
1559
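scrub_add_page_to_wr_bio() above keeps one write bio open per scrub context and only appends a page while both the physical and the logical address continue the current run; any gap, or a full bio, triggers scrub_wr_submit() and a retry with a fresh bio. A minimal user-space sketch of that accumulate-or-flush pattern (standard C only; fake_bio, add_page and flush_bio are hypothetical names, not btrfs API):

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096
    #define PAGES_PER_BIO 32

    struct fake_bio {
        unsigned long long physical;    /* start of the contiguous run */
        int page_count;
    };

    static struct fake_bio *curr;

    static void flush_bio(void)
    {
        if (!curr)
            return;
        printf("submit: phys=%llu pages=%d\n", curr->physical,
               curr->page_count);
        free(curr);
        curr = NULL;
    }

    /* add one page; flush first if it would break contiguity or overflow */
    static int add_page(unsigned long long physical)
    {
    again:
        if (!curr) {
            curr = calloc(1, sizeof(*curr));
            if (!curr)
                return -1;
            curr->physical = physical;
        } else if (curr->physical +
                   (unsigned long long)curr->page_count * PAGE_SIZE
                   != physical) {
            flush_bio();        /* discontiguous: submit and start over */
            goto again;
        }
        curr->page_count++;
        if (curr->page_count == PAGES_PER_BIO)
            flush_bio();        /* full: submit eagerly */
        return 0;
    }

    int main(void)
    {
        /* two contiguous pages, then a gap that forces a submit */
        add_page(0);
        add_page(PAGE_SIZE);
        add_page(10 * PAGE_SIZE);
        flush_bio();            /* push the trailing partial bio */
        return 0;
    }

After a flush, the goto-again retry drops back into the allocation branch and starts a new run at the incoming physical address, mirroring the kernel loop.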
1560static void scrub_wr_submit(struct scrub_ctx *sctx)
1561{
1562 struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1563 struct scrub_bio *sbio;
1564
1565 if (!wr_ctx->wr_curr_bio)
1566 return;
1567
1568 sbio = wr_ctx->wr_curr_bio;
1569 wr_ctx->wr_curr_bio = NULL;
1570 WARN_ON(!sbio->bio->bi_bdev);
1571 scrub_pending_bio_inc(sctx);
1192 1572 /* process all writes in a single worker thread, so that the block layer
1193 1573 * can order the requests before sending them to the driver; this
1194 1574 * doubled the write performance on spinning disks when measured
1195 1575 * with Linux 3.5 */
1576 btrfsic_submit_bio(WRITE, sbio->bio);
1577}
1578
1579static void scrub_wr_bio_end_io(struct bio *bio, int err)
1580{
1581 struct scrub_bio *sbio = bio->bi_private;
1582 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1583
1584 sbio->err = err;
1585 sbio->bio = bio;
1586
1587 sbio->work.func = scrub_wr_bio_end_io_worker;
1588 btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
1589}
1590
1591static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1592{
1593 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1594 struct scrub_ctx *sctx = sbio->sctx;
1595 int i;
1596
1597 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1598 if (sbio->err) {
1599 struct btrfs_dev_replace *dev_replace =
1600 &sbio->sctx->dev_root->fs_info->dev_replace;
1601
1602 for (i = 0; i < sbio->page_count; i++) {
1603 struct scrub_page *spage = sbio->pagev[i];
1604
1605 spage->io_error = 1;
1606 btrfs_dev_replace_stats_inc(&dev_replace->
1607 num_write_errors);
1608 }
1609 }
1610
1611 for (i = 0; i < sbio->page_count; i++)
1612 scrub_page_put(sbio->pagev[i]);
1613
1614 bio_put(sbio->bio);
1615 kfree(sbio);
1616 scrub_pending_bio_dec(sctx);
1617}
1618
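scrub_wr_bio_end_io() runs in bio completion context, so it only records the error and requeues itself; all page bookkeeping happens later in scrub_wr_bio_end_io_worker() on the scrub_wr_completion_workers queue. A sketch of that defer-to-worker idiom (hypothetical names; a real workqueue would run the entries on another thread):

    #include <stdio.h>

    /* minimal stand-ins for btrfs_work / btrfs_queue_worker */
    struct work {
        void (*func)(struct work *);
    };

    static struct work *queue[8];
    static int queued;

    static void queue_worker(struct work *w)
    {
        queue[queued++] = w;    /* real code hands this to a worker thread */
    }

    static void run_queued(void)
    {
        int i;

        for (i = 0; i < queued; i++)
            queue[i]->func(queue[i]);
        queued = 0;
    }

    /* "end_io": record status, then defer the heavy lifting */
    struct bio_done {
        struct work work;       /* must stay the first member, see cast */
        int err;
    };

    static void bio_end_io_worker(struct work *w)
    {
        struct bio_done *d = (struct bio_done *)w;  /* container_of() */

        printf("completion handled in worker, err=%d\n", d->err);
    }

    static void bio_end_io(struct bio_done *d, int err)
    {
        d->err = err;
        d->work.func = bio_end_io_worker;
        queue_worker(&d->work);
    }

    int main(void)
    {
        struct bio_done d;

        bio_end_io(&d, 0);      /* called from "interrupt" context */
        run_queued();           /* worker picks it up later */
        return 0;
    }

The cast from struct work back to the embedding struct stands in for the kernel's container_of(); it is valid here because work is the first member.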
1619static int scrub_checksum(struct scrub_block *sblock)
1241{ 1620{
1242 u64 flags; 1621 u64 flags;
1243 int ret; 1622 int ret;
1244 1623
1245 BUG_ON(sblock->page_count < 1); 1624 WARN_ON(sblock->page_count < 1);
1246 flags = sblock->pagev[0].flags; 1625 flags = sblock->pagev[0]->flags;
1247 ret = 0; 1626 ret = 0;
1248 if (flags & BTRFS_EXTENT_FLAG_DATA) 1627 if (flags & BTRFS_EXTENT_FLAG_DATA)
1249 ret = scrub_checksum_data(sblock); 1628 ret = scrub_checksum_data(sblock);
@@ -1255,30 +1634,32 @@ static void scrub_checksum(struct scrub_block *sblock)
1255 WARN_ON(1); 1634 WARN_ON(1);
1256 if (ret) 1635 if (ret)
1257 scrub_handle_errored_block(sblock); 1636 scrub_handle_errored_block(sblock);
1637
1638 return ret;
1258} 1639}
1259 1640
1260static int scrub_checksum_data(struct scrub_block *sblock) 1641static int scrub_checksum_data(struct scrub_block *sblock)
1261{ 1642{
1262 struct scrub_dev *sdev = sblock->sdev; 1643 struct scrub_ctx *sctx = sblock->sctx;
1263 u8 csum[BTRFS_CSUM_SIZE]; 1644 u8 csum[BTRFS_CSUM_SIZE];
1264 u8 *on_disk_csum; 1645 u8 *on_disk_csum;
1265 struct page *page; 1646 struct page *page;
1266 void *buffer; 1647 void *buffer;
1267 u32 crc = ~(u32)0; 1648 u32 crc = ~(u32)0;
1268 int fail = 0; 1649 int fail = 0;
1269 struct btrfs_root *root = sdev->dev->dev_root; 1650 struct btrfs_root *root = sctx->dev_root;
1270 u64 len; 1651 u64 len;
1271 int index; 1652 int index;
1272 1653
1273 BUG_ON(sblock->page_count < 1); 1654 BUG_ON(sblock->page_count < 1);
1274 if (!sblock->pagev[0].have_csum) 1655 if (!sblock->pagev[0]->have_csum)
1275 return 0; 1656 return 0;
1276 1657
1277 on_disk_csum = sblock->pagev[0].csum; 1658 on_disk_csum = sblock->pagev[0]->csum;
1278 page = sblock->pagev[0].page; 1659 page = sblock->pagev[0]->page;
1279 buffer = kmap_atomic(page); 1660 buffer = kmap_atomic(page);
1280 1661
1281 len = sdev->sectorsize; 1662 len = sctx->sectorsize;
1282 index = 0; 1663 index = 0;
1283 for (;;) { 1664 for (;;) {
1284 u64 l = min_t(u64, len, PAGE_SIZE); 1665 u64 l = min_t(u64, len, PAGE_SIZE);
@@ -1290,13 +1671,13 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1290 break; 1671 break;
1291 index++; 1672 index++;
1292 BUG_ON(index >= sblock->page_count); 1673 BUG_ON(index >= sblock->page_count);
1293 BUG_ON(!sblock->pagev[index].page); 1674 BUG_ON(!sblock->pagev[index]->page);
1294 page = sblock->pagev[index].page; 1675 page = sblock->pagev[index]->page;
1295 buffer = kmap_atomic(page); 1676 buffer = kmap_atomic(page);
1296 } 1677 }
1297 1678
1298 btrfs_csum_final(crc, csum); 1679 btrfs_csum_final(crc, csum);
1299 if (memcmp(csum, on_disk_csum, sdev->csum_size)) 1680 if (memcmp(csum, on_disk_csum, sctx->csum_size))
1300 fail = 1; 1681 fail = 1;
1301 1682
1302 return fail; 1683 return fail;
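Because a data block may span several pages that are mapped one at a time, scrub_checksum_data() feeds the running CRC one PAGE_SIZE chunk per iteration and finalizes it afterwards. A runnable sketch of that incremental loop, with a toy mixing function standing in for btrfs's crc32c (csum_update and csum_block are hypothetical helpers):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE 4096

    /* stand-in for btrfs_csum_data(): fold one chunk into the running sum */
    static uint32_t csum_update(uint32_t crc, const uint8_t *buf, size_t len)
    {
        while (len--)
            crc = (crc << 1) ^ *buf++;      /* toy mix, not crc32c */
        return crc;
    }

    /* checksum a logical block that spans several separately mapped pages */
    static uint32_t csum_block(uint8_t *pages[], uint64_t len)
    {
        uint32_t crc = ~(uint32_t)0;
        int index = 0;

        for (;;) {
            uint64_t l = len < PAGE_SIZE ? len : PAGE_SIZE;

            crc = csum_update(crc, pages[index], l);
            len -= l;
            if (len == 0)
                break;
            index++;            /* "kmap" the next page of the block */
        }
        return ~crc;            /* stand-in for btrfs_csum_final() */
    }

    int main(void)
    {
        static uint8_t p0[PAGE_SIZE], p1[PAGE_SIZE];
        uint8_t *pages[] = { p0, p1 };

        memset(p0, 0xaa, sizeof(p0));
        memset(p1, 0x55, sizeof(p1));
        printf("csum=%08x\n", (unsigned)csum_block(pages, 2 * PAGE_SIZE));
        return 0;
    }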
@@ -1304,9 +1685,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
1304 1685
1305static int scrub_checksum_tree_block(struct scrub_block *sblock) 1686static int scrub_checksum_tree_block(struct scrub_block *sblock)
1306{ 1687{
1307 struct scrub_dev *sdev = sblock->sdev; 1688 struct scrub_ctx *sctx = sblock->sctx;
1308 struct btrfs_header *h; 1689 struct btrfs_header *h;
1309 struct btrfs_root *root = sdev->dev->dev_root; 1690 struct btrfs_root *root = sctx->dev_root;
1310 struct btrfs_fs_info *fs_info = root->fs_info; 1691 struct btrfs_fs_info *fs_info = root->fs_info;
1311 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1692 u8 calculated_csum[BTRFS_CSUM_SIZE];
1312 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1693 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1321,10 +1702,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1321 int index; 1702 int index;
1322 1703
1323 BUG_ON(sblock->page_count < 1); 1704 BUG_ON(sblock->page_count < 1);
1324 page = sblock->pagev[0].page; 1705 page = sblock->pagev[0]->page;
1325 mapped_buffer = kmap_atomic(page); 1706 mapped_buffer = kmap_atomic(page);
1326 h = (struct btrfs_header *)mapped_buffer; 1707 h = (struct btrfs_header *)mapped_buffer;
1327 memcpy(on_disk_csum, h->csum, sdev->csum_size); 1708 memcpy(on_disk_csum, h->csum, sctx->csum_size);
1328 1709
1329 /* 1710 /*
1330 * we don't use the getter functions here, as we 1711 * we don't use the getter functions here, as we
@@ -1332,10 +1713,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1332 * b) the page is already kmapped 1713 * b) the page is already kmapped
1333 */ 1714 */
1334 1715
1335 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) 1716 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
1336 ++fail; 1717 ++fail;
1337 1718
1338 if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) 1719 if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
1339 ++fail; 1720 ++fail;
1340 1721
1341 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1722 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1345,8 +1726,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1345 BTRFS_UUID_SIZE)) 1726 BTRFS_UUID_SIZE))
1346 ++fail; 1727 ++fail;
1347 1728
1348 BUG_ON(sdev->nodesize != sdev->leafsize); 1729 WARN_ON(sctx->nodesize != sctx->leafsize);
1349 len = sdev->nodesize - BTRFS_CSUM_SIZE; 1730 len = sctx->nodesize - BTRFS_CSUM_SIZE;
1350 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1731 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1351 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1732 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1352 index = 0; 1733 index = 0;
@@ -1360,15 +1741,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1360 break; 1741 break;
1361 index++; 1742 index++;
1362 BUG_ON(index >= sblock->page_count); 1743 BUG_ON(index >= sblock->page_count);
1363 BUG_ON(!sblock->pagev[index].page); 1744 BUG_ON(!sblock->pagev[index]->page);
1364 page = sblock->pagev[index].page; 1745 page = sblock->pagev[index]->page;
1365 mapped_buffer = kmap_atomic(page); 1746 mapped_buffer = kmap_atomic(page);
1366 mapped_size = PAGE_SIZE; 1747 mapped_size = PAGE_SIZE;
1367 p = mapped_buffer; 1748 p = mapped_buffer;
1368 } 1749 }
1369 1750
1370 btrfs_csum_final(crc, calculated_csum); 1751 btrfs_csum_final(crc, calculated_csum);
1371 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1752 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1372 ++crc_fail; 1753 ++crc_fail;
1373 1754
1374 return fail || crc_fail; 1755 return fail || crc_fail;
@@ -1377,8 +1758,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1377static int scrub_checksum_super(struct scrub_block *sblock) 1758static int scrub_checksum_super(struct scrub_block *sblock)
1378{ 1759{
1379 struct btrfs_super_block *s; 1760 struct btrfs_super_block *s;
1380 struct scrub_dev *sdev = sblock->sdev; 1761 struct scrub_ctx *sctx = sblock->sctx;
1381 struct btrfs_root *root = sdev->dev->dev_root; 1762 struct btrfs_root *root = sctx->dev_root;
1382 struct btrfs_fs_info *fs_info = root->fs_info; 1763 struct btrfs_fs_info *fs_info = root->fs_info;
1383 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1764 u8 calculated_csum[BTRFS_CSUM_SIZE];
1384 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1765 u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1393,15 +1774,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1393 int index; 1774 int index;
1394 1775
1395 BUG_ON(sblock->page_count < 1); 1776 BUG_ON(sblock->page_count < 1);
1396 page = sblock->pagev[0].page; 1777 page = sblock->pagev[0]->page;
1397 mapped_buffer = kmap_atomic(page); 1778 mapped_buffer = kmap_atomic(page);
1398 s = (struct btrfs_super_block *)mapped_buffer; 1779 s = (struct btrfs_super_block *)mapped_buffer;
1399 memcpy(on_disk_csum, s->csum, sdev->csum_size); 1780 memcpy(on_disk_csum, s->csum, sctx->csum_size);
1400 1781
1401 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1782 if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
1402 ++fail_cor; 1783 ++fail_cor;
1403 1784
1404 if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1785 if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
1405 ++fail_gen; 1786 ++fail_gen;
1406 1787
1407 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1788 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1421,15 +1802,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1421 break; 1802 break;
1422 index++; 1803 index++;
1423 BUG_ON(index >= sblock->page_count); 1804 BUG_ON(index >= sblock->page_count);
1424 BUG_ON(!sblock->pagev[index].page); 1805 BUG_ON(!sblock->pagev[index]->page);
1425 page = sblock->pagev[index].page; 1806 page = sblock->pagev[index]->page;
1426 mapped_buffer = kmap_atomic(page); 1807 mapped_buffer = kmap_atomic(page);
1427 mapped_size = PAGE_SIZE; 1808 mapped_size = PAGE_SIZE;
1428 p = mapped_buffer; 1809 p = mapped_buffer;
1429 } 1810 }
1430 1811
1431 btrfs_csum_final(crc, calculated_csum); 1812 btrfs_csum_final(crc, calculated_csum);
1432 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1813 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1433 ++fail_cor; 1814 ++fail_cor;
1434 1815
1435 if (fail_cor + fail_gen) { 1816 if (fail_cor + fail_gen) {
@@ -1438,14 +1819,14 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1438 * They will get written with the next transaction commit 1819 * They will get written with the next transaction commit
1439 * anyway 1820 * anyway
1440 */ 1821 */
1441 spin_lock(&sdev->stat_lock); 1822 spin_lock(&sctx->stat_lock);
1442 ++sdev->stat.super_errors; 1823 ++sctx->stat.super_errors;
1443 spin_unlock(&sdev->stat_lock); 1824 spin_unlock(&sctx->stat_lock);
1444 if (fail_cor) 1825 if (fail_cor)
1445 btrfs_dev_stat_inc_and_print(sdev->dev, 1826 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1446 BTRFS_DEV_STAT_CORRUPTION_ERRS); 1827 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1447 else 1828 else
1448 btrfs_dev_stat_inc_and_print(sdev->dev, 1829 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1449 BTRFS_DEV_STAT_GENERATION_ERRS); 1830 BTRFS_DEV_STAT_GENERATION_ERRS);
1450 } 1831 }
1451 1832
@@ -1463,28 +1844,54 @@ static void scrub_block_put(struct scrub_block *sblock)
1463 int i; 1844 int i;
1464 1845
1465 for (i = 0; i < sblock->page_count; i++) 1846 for (i = 0; i < sblock->page_count; i++)
1466 if (sblock->pagev[i].page) 1847 scrub_page_put(sblock->pagev[i]);
1467 __free_page(sblock->pagev[i].page);
1468 kfree(sblock); 1848 kfree(sblock);
1469 } 1849 }
1470} 1850}
1471 1851
1472static void scrub_submit(struct scrub_dev *sdev) 1852static void scrub_page_get(struct scrub_page *spage)
1853{
1854 atomic_inc(&spage->ref_count);
1855}
1856
1857static void scrub_page_put(struct scrub_page *spage)
1858{
1859 if (atomic_dec_and_test(&spage->ref_count)) {
1860 if (spage->page)
1861 __free_page(spage->page);
1862 kfree(spage);
1863 }
1864}
1865
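scrub_page_get()/scrub_page_put() are the standard last-reference-frees idiom: only the caller whose decrement reaches zero frees the page, so a page can be shared between a scrub_block and any number of bios without double frees. An equivalent user-space sketch with C11 atomics (page_ref and friends are hypothetical):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct page_ref {
        atomic_int ref_count;
        void *payload;
    };

    static struct page_ref *page_alloc(void)
    {
        struct page_ref *p = calloc(1, sizeof(*p));

        if (p)
            atomic_init(&p->ref_count, 1);  /* creator holds one ref */
        return p;
    }

    static void page_get(struct page_ref *p)
    {
        atomic_fetch_add(&p->ref_count, 1);
    }

    static void page_put(struct page_ref *p)
    {
        /* fetch_sub returns the old value; 1 means we dropped the last ref */
        if (atomic_fetch_sub(&p->ref_count, 1) == 1) {
            free(p->payload);
            free(p);
        }
    }

    int main(void)
    {
        struct page_ref *p = page_alloc();

        page_get(p);    /* e.g. one ref for the block, one for a bio */
        page_put(p);    /* bio completion */
        page_put(p);    /* block teardown drops the last ref and frees */
        return 0;
    }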
1866static void scrub_submit(struct scrub_ctx *sctx)
1473{ 1867{
1474 struct scrub_bio *sbio; 1868 struct scrub_bio *sbio;
1475 1869
1476 if (sdev->curr == -1) 1870 if (sctx->curr == -1)
1477 return; 1871 return;
1478 1872
1479 sbio = sdev->bios[sdev->curr]; 1873 sbio = sctx->bios[sctx->curr];
1480 sdev->curr = -1; 1874 sctx->curr = -1;
1481 atomic_inc(&sdev->in_flight); 1875 scrub_pending_bio_inc(sctx);
1482 1876
1483 btrfsic_submit_bio(READ, sbio->bio); 1877 if (!sbio->bio->bi_bdev) {
1878 /*
1879 * this case should not happen. If btrfs_map_block() is
1880 * wrong, it could happen for dev-replace operations on
L1448 1881 * missing devices when no mirrors are available, but in
L1449 1882 * that case the mount should already have failed.
1883 * This case is handled correctly (but _very_ slowly).
1884 */
1885 printk_ratelimited(KERN_WARNING
1886 "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
1887 bio_endio(sbio->bio, -EIO);
1888 } else {
1889 btrfsic_submit_bio(READ, sbio->bio);
1890 }
1484} 1891}
1485 1892
1486static int scrub_add_page_to_bio(struct scrub_dev *sdev, 1893static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1487 struct scrub_page *spage) 1894 struct scrub_page *spage)
1488{ 1895{
1489 struct scrub_block *sblock = spage->sblock; 1896 struct scrub_block *sblock = spage->sblock;
1490 struct scrub_bio *sbio; 1897 struct scrub_bio *sbio;
@@ -1494,28 +1901,29 @@ again:
1494 /* 1901 /*
1495 * grab a fresh bio or wait for one to become available 1902 * grab a fresh bio or wait for one to become available
1496 */ 1903 */
1497 while (sdev->curr == -1) { 1904 while (sctx->curr == -1) {
1498 spin_lock(&sdev->list_lock); 1905 spin_lock(&sctx->list_lock);
1499 sdev->curr = sdev->first_free; 1906 sctx->curr = sctx->first_free;
1500 if (sdev->curr != -1) { 1907 if (sctx->curr != -1) {
1501 sdev->first_free = sdev->bios[sdev->curr]->next_free; 1908 sctx->first_free = sctx->bios[sctx->curr]->next_free;
1502 sdev->bios[sdev->curr]->next_free = -1; 1909 sctx->bios[sctx->curr]->next_free = -1;
1503 sdev->bios[sdev->curr]->page_count = 0; 1910 sctx->bios[sctx->curr]->page_count = 0;
1504 spin_unlock(&sdev->list_lock); 1911 spin_unlock(&sctx->list_lock);
1505 } else { 1912 } else {
1506 spin_unlock(&sdev->list_lock); 1913 spin_unlock(&sctx->list_lock);
1507 wait_event(sdev->list_wait, sdev->first_free != -1); 1914 wait_event(sctx->list_wait, sctx->first_free != -1);
1508 } 1915 }
1509 } 1916 }
1510 sbio = sdev->bios[sdev->curr]; 1917 sbio = sctx->bios[sctx->curr];
1511 if (sbio->page_count == 0) { 1918 if (sbio->page_count == 0) {
1512 struct bio *bio; 1919 struct bio *bio;
1513 1920
1514 sbio->physical = spage->physical; 1921 sbio->physical = spage->physical;
1515 sbio->logical = spage->logical; 1922 sbio->logical = spage->logical;
1923 sbio->dev = spage->dev;
1516 bio = sbio->bio; 1924 bio = sbio->bio;
1517 if (!bio) { 1925 if (!bio) {
1518 bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); 1926 bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1519 if (!bio) 1927 if (!bio)
1520 return -ENOMEM; 1928 return -ENOMEM;
1521 sbio->bio = bio; 1929 sbio->bio = bio;
@@ -1523,14 +1931,15 @@ again:
1523 1931
1524 bio->bi_private = sbio; 1932 bio->bi_private = sbio;
1525 bio->bi_end_io = scrub_bio_end_io; 1933 bio->bi_end_io = scrub_bio_end_io;
1526 bio->bi_bdev = sdev->dev->bdev; 1934 bio->bi_bdev = sbio->dev->bdev;
1527 bio->bi_sector = spage->physical >> 9; 1935 bio->bi_sector = sbio->physical >> 9;
1528 sbio->err = 0; 1936 sbio->err = 0;
1529 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1937 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1530 spage->physical || 1938 spage->physical ||
1531 sbio->logical + sbio->page_count * PAGE_SIZE != 1939 sbio->logical + sbio->page_count * PAGE_SIZE !=
1532 spage->logical) { 1940 spage->logical ||
1533 scrub_submit(sdev); 1941 sbio->dev != spage->dev) {
1942 scrub_submit(sctx);
1534 goto again; 1943 goto again;
1535 } 1944 }
1536 1945
@@ -1542,81 +1951,87 @@ again:
1542 sbio->bio = NULL; 1951 sbio->bio = NULL;
1543 return -EIO; 1952 return -EIO;
1544 } 1953 }
1545 scrub_submit(sdev); 1954 scrub_submit(sctx);
1546 goto again; 1955 goto again;
1547 } 1956 }
1548 1957
1549 scrub_block_get(sblock); /* one for the added page */ 1958 scrub_block_get(sblock); /* one for the page added to the bio */
1550 atomic_inc(&sblock->outstanding_pages); 1959 atomic_inc(&sblock->outstanding_pages);
1551 sbio->page_count++; 1960 sbio->page_count++;
1552 if (sbio->page_count == sdev->pages_per_bio) 1961 if (sbio->page_count == sctx->pages_per_rd_bio)
1553 scrub_submit(sdev); 1962 scrub_submit(sctx);
1554 1963
1555 return 0; 1964 return 0;
1556} 1965}
1557 1966
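The read path recycles a fixed pool of scrub_bios: free entries are chained through their next_free indices, sctx->first_free heads the chain, and sctx->curr names the bio currently being filled; a submitter that finds the list empty sleeps on list_wait until a completion returns an index. A single-threaded sketch of the index-chained free list (locking and the wait are elided; names hypothetical):

    #include <stdio.h>

    #define NBIOS 4

    struct pool_bio {
        int index;
        int next_free;
    };

    static struct pool_bio bios[NBIOS];
    static int first_free;
    static int curr = -1;

    static void pool_init(void)
    {
        int i;

        for (i = 0; i < NBIOS; i++) {
            bios[i].index = i;
            bios[i].next_free = i + 1 < NBIOS ? i + 1 : -1;
        }
        first_free = 0;
    }

    /* take the head of the free list as the current bio (or fail: -1) */
    static int pool_take(void)
    {
        if (first_free == -1)
            return -1;          /* real code waits on list_wait here */
        curr = first_free;
        first_free = bios[curr].next_free;
        bios[curr].next_free = -1;
        return curr;
    }

    /* completion path: push the bio back and wake any waiter */
    static void pool_give_back(int index)
    {
        bios[index].next_free = first_free;
        first_free = index;
    }

    int main(void)
    {
        pool_init();
        printf("took bio %d\n", pool_take());
        printf("took bio %d\n", pool_take());
        pool_give_back(0);
        printf("took bio %d\n", pool_take());   /* reuses bio 0 */
        return 0;
    }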
1558static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 1967static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1559 u64 physical, u64 flags, u64 gen, int mirror_num, 1968 u64 physical, struct btrfs_device *dev, u64 flags,
1560 u8 *csum, int force) 1969 u64 gen, int mirror_num, u8 *csum, int force,
1970 u64 physical_for_dev_replace)
1561{ 1971{
1562 struct scrub_block *sblock; 1972 struct scrub_block *sblock;
1563 int index; 1973 int index;
1564 1974
1565 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 1975 sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1566 if (!sblock) { 1976 if (!sblock) {
1567 spin_lock(&sdev->stat_lock); 1977 spin_lock(&sctx->stat_lock);
1568 sdev->stat.malloc_errors++; 1978 sctx->stat.malloc_errors++;
1569 spin_unlock(&sdev->stat_lock); 1979 spin_unlock(&sctx->stat_lock);
1570 return -ENOMEM; 1980 return -ENOMEM;
1571 } 1981 }
1572 1982
1573 /* one ref inside this function, plus one for each page later on */ 1983 /* one ref inside this function, plus one for each page added to
1984 * a bio later on */
1574 atomic_set(&sblock->ref_count, 1); 1985 atomic_set(&sblock->ref_count, 1);
1575 sblock->sdev = sdev; 1986 sblock->sctx = sctx;
1576 sblock->no_io_error_seen = 1; 1987 sblock->no_io_error_seen = 1;
1577 1988
1578 for (index = 0; len > 0; index++) { 1989 for (index = 0; len > 0; index++) {
1579 struct scrub_page *spage = sblock->pagev + index; 1990 struct scrub_page *spage;
1580 u64 l = min_t(u64, len, PAGE_SIZE); 1991 u64 l = min_t(u64, len, PAGE_SIZE);
1581 1992
1582 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 1993 spage = kzalloc(sizeof(*spage), GFP_NOFS);
1583 spage->page = alloc_page(GFP_NOFS); 1994 if (!spage) {
1584 if (!spage->page) { 1995leave_nomem:
1585 spin_lock(&sdev->stat_lock); 1996 spin_lock(&sctx->stat_lock);
1586 sdev->stat.malloc_errors++; 1997 sctx->stat.malloc_errors++;
1587 spin_unlock(&sdev->stat_lock); 1998 spin_unlock(&sctx->stat_lock);
1588 while (index > 0) { 1999 scrub_block_put(sblock);
1589 index--;
1590 __free_page(sblock->pagev[index].page);
1591 }
1592 kfree(sblock);
1593 return -ENOMEM; 2000 return -ENOMEM;
1594 } 2001 }
2002 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2003 scrub_page_get(spage);
2004 sblock->pagev[index] = spage;
1595 spage->sblock = sblock; 2005 spage->sblock = sblock;
1596 spage->dev = sdev->dev; 2006 spage->dev = dev;
1597 spage->flags = flags; 2007 spage->flags = flags;
1598 spage->generation = gen; 2008 spage->generation = gen;
1599 spage->logical = logical; 2009 spage->logical = logical;
1600 spage->physical = physical; 2010 spage->physical = physical;
2011 spage->physical_for_dev_replace = physical_for_dev_replace;
1601 spage->mirror_num = mirror_num; 2012 spage->mirror_num = mirror_num;
1602 if (csum) { 2013 if (csum) {
1603 spage->have_csum = 1; 2014 spage->have_csum = 1;
1604 memcpy(spage->csum, csum, sdev->csum_size); 2015 memcpy(spage->csum, csum, sctx->csum_size);
1605 } else { 2016 } else {
1606 spage->have_csum = 0; 2017 spage->have_csum = 0;
1607 } 2018 }
1608 sblock->page_count++; 2019 sblock->page_count++;
2020 spage->page = alloc_page(GFP_NOFS);
2021 if (!spage->page)
2022 goto leave_nomem;
1609 len -= l; 2023 len -= l;
1610 logical += l; 2024 logical += l;
1611 physical += l; 2025 physical += l;
2026 physical_for_dev_replace += l;
1612 } 2027 }
1613 2028
1614 BUG_ON(sblock->page_count == 0); 2029 WARN_ON(sblock->page_count == 0);
1615 for (index = 0; index < sblock->page_count; index++) { 2030 for (index = 0; index < sblock->page_count; index++) {
1616 struct scrub_page *spage = sblock->pagev + index; 2031 struct scrub_page *spage = sblock->pagev[index];
1617 int ret; 2032 int ret;
1618 2033
1619 ret = scrub_add_page_to_bio(sdev, spage); 2034 ret = scrub_add_page_to_rd_bio(sctx, spage);
1620 if (ret) { 2035 if (ret) {
1621 scrub_block_put(sblock); 2036 scrub_block_put(sblock);
1622 return ret; 2037 return ret;
@@ -1624,7 +2039,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1624 } 2039 }
1625 2040
1626 if (force) 2041 if (force)
1627 scrub_submit(sdev); 2042 scrub_submit(sctx);
1628 2043
1629 /* last one frees, either here or in bio completion for last page */ 2044 /* last one frees, either here or in bio completion for last page */
1630 scrub_block_put(sblock); 2045 scrub_block_put(sblock);
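Note the allocation discipline in the rewritten scrub_pages(): each spage is attached to sblock->pagev (with its reference taken) before the next allocation is attempted, so the single leave_nomem path can release everything through one scrub_block_put() instead of unwinding by hand as the old code did. A sketch of that attach-then-fail-cleanly pattern (blk/blk_put are hypothetical; plain malloc stands in for page allocation):

    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_PAGES 16

    struct blk {
        void *pagev[MAX_PAGES];
        int page_count;
    };

    static void blk_put(struct blk *b)
    {
        int i;

        for (i = 0; i < b->page_count; i++)
            free(b->pagev[i]);
        free(b);
    }

    /* allocate a block of n pages; on any failure, one put frees it all */
    static struct blk *blk_alloc(int n)
    {
        struct blk *b = calloc(1, sizeof(*b));
        int i;

        if (!b || n > MAX_PAGES)
            return NULL;
        for (i = 0; i < n; i++) {
            void *p = malloc(64);

            if (!p) {
                blk_put(b);     /* frees the pages attached so far */
                return NULL;
            }
            b->pagev[b->page_count++] = p;  /* attach before next alloc */
        }
        return b;
    }

    int main(void)
    {
        struct blk *b = blk_alloc(4);

        if (b) {
            printf("allocated %d pages\n", b->page_count);
            blk_put(b);
        }
        return 0;
    }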
@@ -1634,8 +2049,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
1634static void scrub_bio_end_io(struct bio *bio, int err) 2049static void scrub_bio_end_io(struct bio *bio, int err)
1635{ 2050{
1636 struct scrub_bio *sbio = bio->bi_private; 2051 struct scrub_bio *sbio = bio->bi_private;
1637 struct scrub_dev *sdev = sbio->sdev; 2052 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1638 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
1639 2053
1640 sbio->err = err; 2054 sbio->err = err;
1641 sbio->bio = bio; 2055 sbio->bio = bio;
@@ -1646,10 +2060,10 @@ static void scrub_bio_end_io(struct bio *bio, int err)
1646static void scrub_bio_end_io_worker(struct btrfs_work *work) 2060static void scrub_bio_end_io_worker(struct btrfs_work *work)
1647{ 2061{
1648 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2062 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1649 struct scrub_dev *sdev = sbio->sdev; 2063 struct scrub_ctx *sctx = sbio->sctx;
1650 int i; 2064 int i;
1651 2065
1652 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); 2066 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
1653 if (sbio->err) { 2067 if (sbio->err) {
1654 for (i = 0; i < sbio->page_count; i++) { 2068 for (i = 0; i < sbio->page_count; i++) {
1655 struct scrub_page *spage = sbio->pagev[i]; 2069 struct scrub_page *spage = sbio->pagev[i];
@@ -1671,23 +2085,37 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
1671 2085
1672 bio_put(sbio->bio); 2086 bio_put(sbio->bio);
1673 sbio->bio = NULL; 2087 sbio->bio = NULL;
1674 spin_lock(&sdev->list_lock); 2088 spin_lock(&sctx->list_lock);
1675 sbio->next_free = sdev->first_free; 2089 sbio->next_free = sctx->first_free;
1676 sdev->first_free = sbio->index; 2090 sctx->first_free = sbio->index;
1677 spin_unlock(&sdev->list_lock); 2091 spin_unlock(&sctx->list_lock);
1678 atomic_dec(&sdev->in_flight); 2092
1679 wake_up(&sdev->list_wait); 2093 if (sctx->is_dev_replace &&
2094 atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2095 mutex_lock(&sctx->wr_ctx.wr_lock);
2096 scrub_wr_submit(sctx);
2097 mutex_unlock(&sctx->wr_ctx.wr_lock);
2098 }
2099
2100 scrub_pending_bio_dec(sctx);
1680} 2101}
1681 2102
1682static void scrub_block_complete(struct scrub_block *sblock) 2103static void scrub_block_complete(struct scrub_block *sblock)
1683{ 2104{
1684 if (!sblock->no_io_error_seen) 2105 if (!sblock->no_io_error_seen) {
1685 scrub_handle_errored_block(sblock); 2106 scrub_handle_errored_block(sblock);
1686 else 2107 } else {
1687 scrub_checksum(sblock); 2108 /*
L1656 2109 * if the block has a checksum error, it is written via the
L1657 2110 * repair mechanism in the dev-replace case; otherwise it is
L1658 2111 * written here in the dev-replace case.
2112 */
2113 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2114 scrub_write_block_to_dev_replace(sblock);
2115 }
1688} 2116}
1689 2117
1690static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, 2118static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
1691 u8 *csum) 2119 u8 *csum)
1692{ 2120{
1693 struct btrfs_ordered_sum *sum = NULL; 2121 struct btrfs_ordered_sum *sum = NULL;
@@ -1695,15 +2123,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1695 unsigned long i; 2123 unsigned long i;
1696 unsigned long num_sectors; 2124 unsigned long num_sectors;
1697 2125
1698 while (!list_empty(&sdev->csum_list)) { 2126 while (!list_empty(&sctx->csum_list)) {
1699 sum = list_first_entry(&sdev->csum_list, 2127 sum = list_first_entry(&sctx->csum_list,
1700 struct btrfs_ordered_sum, list); 2128 struct btrfs_ordered_sum, list);
1701 if (sum->bytenr > logical) 2129 if (sum->bytenr > logical)
1702 return 0; 2130 return 0;
1703 if (sum->bytenr + sum->len > logical) 2131 if (sum->bytenr + sum->len > logical)
1704 break; 2132 break;
1705 2133
1706 ++sdev->stat.csum_discards; 2134 ++sctx->stat.csum_discards;
1707 list_del(&sum->list); 2135 list_del(&sum->list);
1708 kfree(sum); 2136 kfree(sum);
1709 sum = NULL; 2137 sum = NULL;
@@ -1711,10 +2139,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1711 if (!sum) 2139 if (!sum)
1712 return 0; 2140 return 0;
1713 2141
1714 num_sectors = sum->len / sdev->sectorsize; 2142 num_sectors = sum->len / sctx->sectorsize;
1715 for (i = 0; i < num_sectors; ++i) { 2143 for (i = 0; i < num_sectors; ++i) {
1716 if (sum->sums[i].bytenr == logical) { 2144 if (sum->sums[i].bytenr == logical) {
1717 memcpy(csum, &sum->sums[i].sum, sdev->csum_size); 2145 memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
1718 ret = 1; 2146 ret = 1;
1719 break; 2147 break;
1720 } 2148 }
@@ -1727,29 +2155,30 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
1727} 2155}
1728 2156
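scrub_find_csum() discards ordered-sum entries that end before the requested logical address, stops early once an entry starts beyond it, and otherwise scans the matching entry's per-sector sums for an exact bytenr hit. A sketch over a plain sorted array (the real code walks a list of btrfs_ordered_sum; these types are hypothetical):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define SECTORSIZE 4096
    #define CSUM_SIZE 4

    struct sector_sum {
        uint64_t bytenr;
        uint8_t sum[CSUM_SIZE];
    };

    struct ordered_sum {
        uint64_t bytenr;        /* start of the covered range */
        uint64_t len;           /* bytes covered */
        struct sector_sum *sums;
    };

    /* return 1 and copy the csum if one exists for @logical, else 0 */
    static int find_csum(struct ordered_sum *list, int n, uint64_t logical,
                         uint8_t *csum)
    {
        int i, j;

        for (i = 0; i < n; i++) {
            if (list[i].bytenr > logical)
                return 0;       /* sorted: nothing further can match */
            if (list[i].bytenr + list[i].len <= logical)
                continue;       /* real code discards these entries */
            for (j = 0; j < (int)(list[i].len / SECTORSIZE); j++) {
                if (list[i].sums[j].bytenr == logical) {
                    memcpy(csum, list[i].sums[j].sum, CSUM_SIZE);
                    return 1;
                }
            }
            return 0;
        }
        return 0;
    }

    int main(void)
    {
        struct sector_sum ss[2] = {
            { 8192, { 1, 2, 3, 4 } },
            { 12288, { 5, 6, 7, 8 } },
        };
        struct ordered_sum list[1] = { { 8192, 2 * SECTORSIZE, ss } };
        uint8_t csum[CSUM_SIZE];

        if (find_csum(list, 1, 12288, csum))
            printf("csum[0]=%u\n", csum[0]);    /* prints 5 */
        return 0;
    }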
1699 1729/* scrub extent tries to collect up to 64 kB for each bio */ 2157 /* scrub extent tries to collect up to 128 kB for each bio */
1730static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 2158static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
1731 u64 physical, u64 flags, u64 gen, int mirror_num) 2159 u64 physical, struct btrfs_device *dev, u64 flags,
2160 u64 gen, int mirror_num, u64 physical_for_dev_replace)
1732{ 2161{
1733 int ret; 2162 int ret;
1734 u8 csum[BTRFS_CSUM_SIZE]; 2163 u8 csum[BTRFS_CSUM_SIZE];
1735 u32 blocksize; 2164 u32 blocksize;
1736 2165
1737 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2166 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1738 blocksize = sdev->sectorsize; 2167 blocksize = sctx->sectorsize;
1739 spin_lock(&sdev->stat_lock); 2168 spin_lock(&sctx->stat_lock);
1740 sdev->stat.data_extents_scrubbed++; 2169 sctx->stat.data_extents_scrubbed++;
1741 sdev->stat.data_bytes_scrubbed += len; 2170 sctx->stat.data_bytes_scrubbed += len;
1742 spin_unlock(&sdev->stat_lock); 2171 spin_unlock(&sctx->stat_lock);
1743 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2172 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1744 BUG_ON(sdev->nodesize != sdev->leafsize); 2173 WARN_ON(sctx->nodesize != sctx->leafsize);
1745 blocksize = sdev->nodesize; 2174 blocksize = sctx->nodesize;
1746 spin_lock(&sdev->stat_lock); 2175 spin_lock(&sctx->stat_lock);
1747 sdev->stat.tree_extents_scrubbed++; 2176 sctx->stat.tree_extents_scrubbed++;
1748 sdev->stat.tree_bytes_scrubbed += len; 2177 sctx->stat.tree_bytes_scrubbed += len;
1749 spin_unlock(&sdev->stat_lock); 2178 spin_unlock(&sctx->stat_lock);
1750 } else { 2179 } else {
1751 blocksize = sdev->sectorsize; 2180 blocksize = sctx->sectorsize;
1752 BUG_ON(1); 2181 WARN_ON(1);
1753 } 2182 }
1754 2183
1755 while (len) { 2184 while (len) {
@@ -1758,26 +2187,38 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
1758 2187
1759 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2188 if (flags & BTRFS_EXTENT_FLAG_DATA) {
1760 /* push csums to sbio */ 2189 /* push csums to sbio */
1761 have_csum = scrub_find_csum(sdev, logical, l, csum); 2190 have_csum = scrub_find_csum(sctx, logical, l, csum);
1762 if (have_csum == 0) 2191 if (have_csum == 0)
1763 ++sdev->stat.no_csum; 2192 ++sctx->stat.no_csum;
2193 if (sctx->is_dev_replace && !have_csum) {
2194 ret = copy_nocow_pages(sctx, logical, l,
2195 mirror_num,
2196 physical_for_dev_replace);
2197 goto behind_scrub_pages;
2198 }
1764 } 2199 }
1765 ret = scrub_pages(sdev, logical, l, physical, flags, gen, 2200 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
1766 mirror_num, have_csum ? csum : NULL, 0); 2201 mirror_num, have_csum ? csum : NULL, 0,
2202 physical_for_dev_replace);
2203behind_scrub_pages:
1767 if (ret) 2204 if (ret)
1768 return ret; 2205 return ret;
1769 len -= l; 2206 len -= l;
1770 logical += l; 2207 logical += l;
1771 physical += l; 2208 physical += l;
2209 physical_for_dev_replace += l;
1772 } 2210 }
1773 return 0; 2211 return 0;
1774} 2212}
1775 2213
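scrub_extent() above cuts an extent into blocksize pieces, looks up a checksum for each piece, and in dev-replace mode diverts data without checksums to the nocow copy path instead of the ordinary bio machinery. A compact sketch of that split-and-dispatch loop (all callbacks hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    /* stand-ins for scrub_find_csum() / scrub_pages() / copy_nocow_pages() */
    static int have_csum_for(uint64_t logical) { return logical % 8192 == 0; }

    static void queue_block(uint64_t logical, uint64_t len, int have_csum)
    {
        printf("block log=%llu len=%llu csum=%d\n",
               (unsigned long long)logical, (unsigned long long)len,
               have_csum);
    }

    static void nocow_copy(uint64_t logical, uint64_t len)
    {
        printf("nocow  log=%llu len=%llu\n",
               (unsigned long long)logical, (unsigned long long)len);
    }

    /* split one extent into blocksize pieces, as scrub_extent() does */
    static void split_extent(uint64_t logical, uint64_t len,
                             uint32_t blocksize, int is_dev_replace)
    {
        while (len) {
            uint64_t l = len < blocksize ? len : blocksize;
            int have_csum = have_csum_for(logical);

            if (is_dev_replace && !have_csum)
                nocow_copy(logical, l); /* data w/o csum: copy raw */
            else
                queue_block(logical, l, have_csum);
            len -= l;
            logical += l;
        }
    }

    int main(void)
    {
        split_extent(0, 3 * 4096, 4096, 1);
        return 0;
    }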
1776static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, 2214static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
1777 struct map_lookup *map, int num, u64 base, u64 length) 2215 struct map_lookup *map,
2216 struct btrfs_device *scrub_dev,
2217 int num, u64 base, u64 length,
2218 int is_dev_replace)
1778{ 2219{
1779 struct btrfs_path *path; 2220 struct btrfs_path *path;
1780 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 2221 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1781 struct btrfs_root *root = fs_info->extent_root; 2222 struct btrfs_root *root = fs_info->extent_root;
1782 struct btrfs_root *csum_root = fs_info->csum_root; 2223 struct btrfs_root *csum_root = fs_info->csum_root;
1783 struct btrfs_extent_item *extent; 2224 struct btrfs_extent_item *extent;
@@ -1797,9 +2238,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1797 struct reada_control *reada2; 2238 struct reada_control *reada2;
1798 struct btrfs_key key_start; 2239 struct btrfs_key key_start;
1799 struct btrfs_key key_end; 2240 struct btrfs_key key_end;
1800
1801 u64 increment = map->stripe_len; 2241 u64 increment = map->stripe_len;
1802 u64 offset; 2242 u64 offset;
2243 u64 extent_logical;
2244 u64 extent_physical;
2245 u64 extent_len;
2246 struct btrfs_device *extent_dev;
2247 int extent_mirror_num;
1803 2248
1804 nstripes = length; 2249 nstripes = length;
1805 offset = 0; 2250 offset = 0;
@@ -1843,8 +2288,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1843 */ 2288 */
1844 logical = base + offset; 2289 logical = base + offset;
1845 2290
1846 wait_event(sdev->list_wait, 2291 wait_event(sctx->list_wait,
1847 atomic_read(&sdev->in_flight) == 0); 2292 atomic_read(&sctx->bios_in_flight) == 0);
1848 atomic_inc(&fs_info->scrubs_paused); 2293 atomic_inc(&fs_info->scrubs_paused);
1849 wake_up(&fs_info->scrub_pause_wait); 2294 wake_up(&fs_info->scrub_pause_wait);
1850 2295
@@ -1898,7 +2343,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1898 * canceled? 2343 * canceled?
1899 */ 2344 */
1900 if (atomic_read(&fs_info->scrub_cancel_req) || 2345 if (atomic_read(&fs_info->scrub_cancel_req) ||
1901 atomic_read(&sdev->cancel_req)) { 2346 atomic_read(&sctx->cancel_req)) {
1902 ret = -ECANCELED; 2347 ret = -ECANCELED;
1903 goto out; 2348 goto out;
1904 } 2349 }
@@ -1907,9 +2352,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1907 */ 2352 */
1908 if (atomic_read(&fs_info->scrub_pause_req)) { 2353 if (atomic_read(&fs_info->scrub_pause_req)) {
1909 /* push queued extents */ 2354 /* push queued extents */
1910 scrub_submit(sdev); 2355 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
1911 wait_event(sdev->list_wait, 2356 scrub_submit(sctx);
1912 atomic_read(&sdev->in_flight) == 0); 2357 mutex_lock(&sctx->wr_ctx.wr_lock);
2358 scrub_wr_submit(sctx);
2359 mutex_unlock(&sctx->wr_ctx.wr_lock);
2360 wait_event(sctx->list_wait,
2361 atomic_read(&sctx->bios_in_flight) == 0);
2362 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
1913 atomic_inc(&fs_info->scrubs_paused); 2363 atomic_inc(&fs_info->scrubs_paused);
1914 wake_up(&fs_info->scrub_pause_wait); 2364 wake_up(&fs_info->scrub_pause_wait);
1915 mutex_lock(&fs_info->scrub_lock); 2365 mutex_lock(&fs_info->scrub_lock);
@@ -1926,7 +2376,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1926 2376
1927 ret = btrfs_lookup_csums_range(csum_root, logical, 2377 ret = btrfs_lookup_csums_range(csum_root, logical,
1928 logical + map->stripe_len - 1, 2378 logical + map->stripe_len - 1,
1929 &sdev->csum_list, 1); 2379 &sctx->csum_list, 1);
1930 if (ret) 2380 if (ret)
1931 goto out; 2381 goto out;
1932 2382
@@ -2004,9 +2454,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
2004 key.objectid; 2454 key.objectid;
2005 } 2455 }
2006 2456
2007 ret = scrub_extent(sdev, key.objectid, key.offset, 2457 extent_logical = key.objectid;
2008 key.objectid - logical + physical, 2458 extent_physical = key.objectid - logical + physical;
2009 flags, generation, mirror_num); 2459 extent_len = key.offset;
2460 extent_dev = scrub_dev;
2461 extent_mirror_num = mirror_num;
2462 if (is_dev_replace)
2463 scrub_remap_extent(fs_info, extent_logical,
2464 extent_len, &extent_physical,
2465 &extent_dev,
2466 &extent_mirror_num);
2467 ret = scrub_extent(sctx, extent_logical, extent_len,
2468 extent_physical, extent_dev, flags,
2469 generation, extent_mirror_num,
2470 key.objectid - logical + physical);
2010 if (ret) 2471 if (ret)
2011 goto out; 2472 goto out;
2012 2473
@@ -2016,29 +2477,34 @@ next:
2016 btrfs_release_path(path); 2477 btrfs_release_path(path);
2017 logical += increment; 2478 logical += increment;
2018 physical += map->stripe_len; 2479 physical += map->stripe_len;
2019 spin_lock(&sdev->stat_lock); 2480 spin_lock(&sctx->stat_lock);
2020 sdev->stat.last_physical = physical; 2481 sctx->stat.last_physical = physical;
2021 spin_unlock(&sdev->stat_lock); 2482 spin_unlock(&sctx->stat_lock);
2022 } 2483 }
2484out:
2023 /* push queued extents */ 2485 /* push queued extents */
2024 scrub_submit(sdev); 2486 scrub_submit(sctx);
2487 mutex_lock(&sctx->wr_ctx.wr_lock);
2488 scrub_wr_submit(sctx);
2489 mutex_unlock(&sctx->wr_ctx.wr_lock);
2025 2490
2026out:
2027 blk_finish_plug(&plug); 2491 blk_finish_plug(&plug);
2028 btrfs_free_path(path); 2492 btrfs_free_path(path);
2029 return ret < 0 ? ret : 0; 2493 return ret < 0 ? ret : 0;
2030} 2494}
2031 2495
2032static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, 2496static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2033 u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, 2497 struct btrfs_device *scrub_dev,
2034 u64 dev_offset) 2498 u64 chunk_tree, u64 chunk_objectid,
2499 u64 chunk_offset, u64 length,
2500 u64 dev_offset, int is_dev_replace)
2035{ 2501{
2036 struct btrfs_mapping_tree *map_tree = 2502 struct btrfs_mapping_tree *map_tree =
2037 &sdev->dev->dev_root->fs_info->mapping_tree; 2503 &sctx->dev_root->fs_info->mapping_tree;
2038 struct map_lookup *map; 2504 struct map_lookup *map;
2039 struct extent_map *em; 2505 struct extent_map *em;
2040 int i; 2506 int i;
2041 int ret = -EINVAL; 2507 int ret = 0;
2042 2508
2043 read_lock(&map_tree->map_tree.lock); 2509 read_lock(&map_tree->map_tree.lock);
2044 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 2510 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
@@ -2055,9 +2521,11 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
2055 goto out; 2521 goto out;
2056 2522
2057 for (i = 0; i < map->num_stripes; ++i) { 2523 for (i = 0; i < map->num_stripes; ++i) {
2058 if (map->stripes[i].dev == sdev->dev && 2524 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2059 map->stripes[i].physical == dev_offset) { 2525 map->stripes[i].physical == dev_offset) {
2060 ret = scrub_stripe(sdev, map, i, chunk_offset, length); 2526 ret = scrub_stripe(sctx, map, scrub_dev, i,
2527 chunk_offset, length,
2528 is_dev_replace);
2061 if (ret) 2529 if (ret)
2062 goto out; 2530 goto out;
2063 } 2531 }
@@ -2069,11 +2537,13 @@ out:
2069} 2537}
2070 2538
2071static noinline_for_stack 2539static noinline_for_stack
2072int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) 2540int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2541 struct btrfs_device *scrub_dev, u64 start, u64 end,
2542 int is_dev_replace)
2073{ 2543{
2074 struct btrfs_dev_extent *dev_extent = NULL; 2544 struct btrfs_dev_extent *dev_extent = NULL;
2075 struct btrfs_path *path; 2545 struct btrfs_path *path;
2076 struct btrfs_root *root = sdev->dev->dev_root; 2546 struct btrfs_root *root = sctx->dev_root;
2077 struct btrfs_fs_info *fs_info = root->fs_info; 2547 struct btrfs_fs_info *fs_info = root->fs_info;
2078 u64 length; 2548 u64 length;
2079 u64 chunk_tree; 2549 u64 chunk_tree;
@@ -2085,6 +2555,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2085 struct btrfs_key key; 2555 struct btrfs_key key;
2086 struct btrfs_key found_key; 2556 struct btrfs_key found_key;
2087 struct btrfs_block_group_cache *cache; 2557 struct btrfs_block_group_cache *cache;
2558 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2088 2559
2089 path = btrfs_alloc_path(); 2560 path = btrfs_alloc_path();
2090 if (!path) 2561 if (!path)
@@ -2094,11 +2565,10 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2094 path->search_commit_root = 1; 2565 path->search_commit_root = 1;
2095 path->skip_locking = 1; 2566 path->skip_locking = 1;
2096 2567
2097 key.objectid = sdev->dev->devid; 2568 key.objectid = scrub_dev->devid;
2098 key.offset = 0ull; 2569 key.offset = 0ull;
2099 key.type = BTRFS_DEV_EXTENT_KEY; 2570 key.type = BTRFS_DEV_EXTENT_KEY;
2100 2571
2101
2102 while (1) { 2572 while (1) {
2103 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2573 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2104 if (ret < 0) 2574 if (ret < 0)
@@ -2117,7 +2587,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2117 2587
2118 btrfs_item_key_to_cpu(l, &found_key, slot); 2588 btrfs_item_key_to_cpu(l, &found_key, slot);
2119 2589
2120 if (found_key.objectid != sdev->dev->devid) 2590 if (found_key.objectid != scrub_dev->devid)
2121 break; 2591 break;
2122 2592
2123 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) 2593 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
@@ -2151,11 +2621,62 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2151 ret = -ENOENT; 2621 ret = -ENOENT;
2152 break; 2622 break;
2153 } 2623 }
2154 ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, 2624 dev_replace->cursor_right = found_key.offset + length;
2155 chunk_offset, length, found_key.offset); 2625 dev_replace->cursor_left = found_key.offset;
2626 dev_replace->item_needs_writeback = 1;
2627 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2628 chunk_offset, length, found_key.offset,
2629 is_dev_replace);
2630
2631 /*
L1944 2632 * flush: submit all pending read and write bios, then
L1945 2633 * wait for them.
2634 * Note that in the dev replace case, a read request causes
2635 * write requests that are submitted in the read completion
2636 * worker. Therefore in the current situation, it is required
2637 * that all write requests are flushed, so that all read and
2638 * write requests are really completed when bios_in_flight
2639 * changes to 0.
2640 */
2641 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2642 scrub_submit(sctx);
2643 mutex_lock(&sctx->wr_ctx.wr_lock);
2644 scrub_wr_submit(sctx);
2645 mutex_unlock(&sctx->wr_ctx.wr_lock);
2646
2647 wait_event(sctx->list_wait,
2648 atomic_read(&sctx->bios_in_flight) == 0);
2649 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2650 atomic_inc(&fs_info->scrubs_paused);
2651 wake_up(&fs_info->scrub_pause_wait);
2652 wait_event(sctx->list_wait,
2653 atomic_read(&sctx->workers_pending) == 0);
2654
2655 mutex_lock(&fs_info->scrub_lock);
2656 while (atomic_read(&fs_info->scrub_pause_req)) {
2657 mutex_unlock(&fs_info->scrub_lock);
2658 wait_event(fs_info->scrub_pause_wait,
2659 atomic_read(&fs_info->scrub_pause_req) == 0);
2660 mutex_lock(&fs_info->scrub_lock);
2661 }
2662 atomic_dec(&fs_info->scrubs_paused);
2663 mutex_unlock(&fs_info->scrub_lock);
2664 wake_up(&fs_info->scrub_pause_wait);
2665
2666 dev_replace->cursor_left = dev_replace->cursor_right;
2667 dev_replace->item_needs_writeback = 1;
2156 btrfs_put_block_group(cache); 2668 btrfs_put_block_group(cache);
2157 if (ret) 2669 if (ret)
2158 break; 2670 break;
2671 if (is_dev_replace &&
2672 atomic64_read(&dev_replace->num_write_errors) > 0) {
2673 ret = -EIO;
2674 break;
2675 }
2676 if (sctx->stat.malloc_errors > 0) {
2677 ret = -ENOMEM;
2678 break;
2679 }
2159 2680
2160 key.offset = found_key.offset + length; 2681 key.offset = found_key.offset + length;
2161 btrfs_release_path(path); 2682 btrfs_release_path(path);
@@ -2170,14 +2691,14 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2170 return ret < 0 ? ret : 0; 2691 return ret < 0 ? ret : 0;
2171} 2692}
2172 2693
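The flush sequence inside scrub_enumerate_chunks() relies on a single in-flight counter: submit everything, including the writes that read completions may still queue, then sleep until bios_in_flight reaches zero. A condensed pthread sketch of that counter-plus-wait_event idiom (user-space stand-ins; build with -lpthread):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int bios_in_flight;

    static void pending_inc(void)
    {
        pthread_mutex_lock(&lock);
        bios_in_flight++;
        pthread_mutex_unlock(&lock);
    }

    static void pending_dec(void)   /* called from the completion side */
    {
        pthread_mutex_lock(&lock);
        if (--bios_in_flight == 0)
            pthread_cond_broadcast(&cond);  /* wake_up(list_wait) */
        pthread_mutex_unlock(&lock);
    }

    static void wait_for_idle(void) /* wait_event(..., in_flight == 0) */
    {
        pthread_mutex_lock(&lock);
        while (bios_in_flight != 0)
            pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
    }

    static void *completion(void *arg)
    {
        (void)arg;
        pending_dec();
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pending_inc();                  /* submit one bio */
        pthread_create(&t, NULL, completion, NULL);
        wait_for_idle();                /* returns once the bio completed */
        pthread_join(&t, NULL);
        printf("all bios completed\n");
        return 0;
    }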
2173static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) 2694static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2695 struct btrfs_device *scrub_dev)
2174{ 2696{
2175 int i; 2697 int i;
2176 u64 bytenr; 2698 u64 bytenr;
2177 u64 gen; 2699 u64 gen;
2178 int ret; 2700 int ret;
2179 struct btrfs_device *device = sdev->dev; 2701 struct btrfs_root *root = sctx->dev_root;
2180 struct btrfs_root *root = device->dev_root;
2181 2702
2182 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2703 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2183 return -EIO; 2704 return -EIO;
@@ -2186,15 +2707,16 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2186 2707
2187 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2708 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2188 bytenr = btrfs_sb_offset(i); 2709 bytenr = btrfs_sb_offset(i);
2189 if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) 2710 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2190 break; 2711 break;
2191 2712
2192 ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2713 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2193 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); 2714 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2715 NULL, 1, bytenr);
2194 if (ret) 2716 if (ret)
2195 return ret; 2717 return ret;
2196 } 2718 }
2197 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2719 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2198 2720
2199 return 0; 2721 return 0;
2200} 2722}
@@ -2202,19 +2724,38 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2202/* 2724/*
2203 * get a reference count on fs_info->scrub_workers. start worker if necessary 2725 * get a reference count on fs_info->scrub_workers. start worker if necessary
2204 */ 2726 */
2205static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 2727static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2728 int is_dev_replace)
2206{ 2729{
2207 struct btrfs_fs_info *fs_info = root->fs_info;
2208 int ret = 0; 2730 int ret = 0;
2209 2731
2210 mutex_lock(&fs_info->scrub_lock); 2732 mutex_lock(&fs_info->scrub_lock);
2211 if (fs_info->scrub_workers_refcnt == 0) { 2733 if (fs_info->scrub_workers_refcnt == 0) {
2212 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2734 if (is_dev_replace)
2213 fs_info->thread_pool_size, &fs_info->generic_worker); 2735 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2736 &fs_info->generic_worker);
2737 else
2738 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2739 fs_info->thread_pool_size,
2740 &fs_info->generic_worker);
2214 fs_info->scrub_workers.idle_thresh = 4; 2741 fs_info->scrub_workers.idle_thresh = 4;
2215 ret = btrfs_start_workers(&fs_info->scrub_workers); 2742 ret = btrfs_start_workers(&fs_info->scrub_workers);
2216 if (ret) 2743 if (ret)
2217 goto out; 2744 goto out;
2745 btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2746 "scrubwrc",
2747 fs_info->thread_pool_size,
2748 &fs_info->generic_worker);
2749 fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2750 ret = btrfs_start_workers(
2751 &fs_info->scrub_wr_completion_workers);
2752 if (ret)
2753 goto out;
2754 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2755 &fs_info->generic_worker);
2756 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2757 if (ret)
2758 goto out;
2218 } 2759 }
2219 ++fs_info->scrub_workers_refcnt; 2760 ++fs_info->scrub_workers_refcnt;
2220out: 2761out:
@@ -2223,40 +2764,41 @@ out:
2223 return ret; 2764 return ret;
2224} 2765}
2225 2766
2226static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 2767static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2227{ 2768{
2228 struct btrfs_fs_info *fs_info = root->fs_info;
2229
2230 mutex_lock(&fs_info->scrub_lock); 2769 mutex_lock(&fs_info->scrub_lock);
2231 if (--fs_info->scrub_workers_refcnt == 0) 2770 if (--fs_info->scrub_workers_refcnt == 0) {
2232 btrfs_stop_workers(&fs_info->scrub_workers); 2771 btrfs_stop_workers(&fs_info->scrub_workers);
2772 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2773 btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2774 }
2233 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2775 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2234 mutex_unlock(&fs_info->scrub_lock); 2776 mutex_unlock(&fs_info->scrub_lock);
2235} 2777}
2236 2778
2237 2779int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2238int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 2780 u64 end, struct btrfs_scrub_progress *progress,
2239 struct btrfs_scrub_progress *progress, int readonly) 2781 int readonly, int is_dev_replace)
2240{ 2782{
2241 struct scrub_dev *sdev; 2783 struct scrub_ctx *sctx;
2242 struct btrfs_fs_info *fs_info = root->fs_info;
2243 int ret; 2784 int ret;
2244 struct btrfs_device *dev; 2785 struct btrfs_device *dev;
2245 2786
2246 if (btrfs_fs_closing(root->fs_info)) 2787 if (btrfs_fs_closing(fs_info))
2247 return -EINVAL; 2788 return -EINVAL;
2248 2789
2249 /* 2790 /*
2250 * check some assumptions 2791 * check some assumptions
2251 */ 2792 */
2252 if (root->nodesize != root->leafsize) { 2793 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2253 printk(KERN_ERR 2794 printk(KERN_ERR
2254 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", 2795 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2255 root->nodesize, root->leafsize); 2796 fs_info->chunk_root->nodesize,
2797 fs_info->chunk_root->leafsize);
2256 return -EINVAL; 2798 return -EINVAL;
2257 } 2799 }
2258 2800
2259 if (root->nodesize > BTRFS_STRIPE_LEN) { 2801 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2260 /* 2802 /*
2261 * in this case scrub is unable to calculate the checksum 2803 * in this case scrub is unable to calculate the checksum
2262 * the way scrub is implemented. Do not handle this 2804 * the way scrub is implemented. Do not handle this
@@ -2264,80 +2806,105 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
2264 */ 2806 */
2265 printk(KERN_ERR 2807 printk(KERN_ERR
2266 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", 2808 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2267 root->nodesize, BTRFS_STRIPE_LEN); 2809 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2268 return -EINVAL; 2810 return -EINVAL;
2269 } 2811 }
2270 2812
2271 if (root->sectorsize != PAGE_SIZE) { 2813 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2272 /* not supported for data w/o checksums */ 2814 /* not supported for data w/o checksums */
2273 printk(KERN_ERR 2815 printk(KERN_ERR
2274 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", 2816 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
2275 root->sectorsize, (unsigned long long)PAGE_SIZE); 2817 fs_info->chunk_root->sectorsize,
2818 (unsigned long long)PAGE_SIZE);
2276 return -EINVAL; 2819 return -EINVAL;
2277 } 2820 }
2278 2821
2279 ret = scrub_workers_get(root); 2822 if (fs_info->chunk_root->nodesize >
2823 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2824 fs_info->chunk_root->sectorsize >
2825 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2826 /*
2827 * would exhaust the array bounds of pagev member in
2828 * struct scrub_block
2829 */
L2138 2830 pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
L2139 2831 fs_info->chunk_root->nodesize,
L2140 2832 (int)(PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK),
L2141 2833 fs_info->chunk_root->sectorsize,
L2142 2834 (int)(PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK));
2835 return -EINVAL;
2836 }
2837
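Taken together, the checks above pin down the geometry scrub can handle. A compact restatement as one predicate (constants assumed for the sketch: PAGE_SIZE 4096, BTRFS_STRIPE_LEN 64k, SCRUB_MAX_PAGES_PER_BLOCK 16, as in this file):

    #include <stdio.h>

    #define PAGE_SIZE 4096
    #define BTRFS_STRIPE_LEN (64 * 1024)
    #define SCRUB_MAX_PAGES_PER_BLOCK 16

    /* mirror the size-assumption checks at the top of btrfs_scrub_dev() */
    static int scrub_geometry_ok(int nodesize, int leafsize, int sectorsize)
    {
        if (nodesize != leafsize)
            return 0;   /* scrub assumes one metadata block size */
        if (nodesize > BTRFS_STRIPE_LEN)
            return 0;   /* csum code cannot span a stripe boundary */
        if (sectorsize != PAGE_SIZE)
            return 0;   /* data w/o checksums not supported otherwise */
        if (nodesize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
            sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK)
            return 0;   /* would overflow scrub_block->pagev[] */
        return 1;
    }

    int main(void)
    {
        printf("4k/4k/4k ok: %d\n", scrub_geometry_ok(4096, 4096, 4096));
        printf("128k nodes ok: %d\n",
               scrub_geometry_ok(128 * 1024, 128 * 1024, 4096));
        return 0;
    }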
2838 ret = scrub_workers_get(fs_info, is_dev_replace);
2280 if (ret) 2839 if (ret)
2281 return ret; 2840 return ret;
2282 2841
2283 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2842 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2284 dev = btrfs_find_device(root, devid, NULL, NULL); 2843 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2285 if (!dev || dev->missing) { 2844 if (!dev || (dev->missing && !is_dev_replace)) {
2286 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2845 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2287 scrub_workers_put(root); 2846 scrub_workers_put(fs_info);
2288 return -ENODEV; 2847 return -ENODEV;
2289 } 2848 }
2290 mutex_lock(&fs_info->scrub_lock); 2849 mutex_lock(&fs_info->scrub_lock);
2291 2850
2292 if (!dev->in_fs_metadata) { 2851 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2293 mutex_unlock(&fs_info->scrub_lock); 2852 mutex_unlock(&fs_info->scrub_lock);
2294 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2853 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2295 scrub_workers_put(root); 2854 scrub_workers_put(fs_info);
2296 return -ENODEV; 2855 return -EIO;
2297 } 2856 }
2298 2857
2299 if (dev->scrub_device) { 2858 btrfs_dev_replace_lock(&fs_info->dev_replace);
2859 if (dev->scrub_device ||
2860 (!is_dev_replace &&
2861 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2862 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2300 mutex_unlock(&fs_info->scrub_lock); 2863 mutex_unlock(&fs_info->scrub_lock);
2301 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2864 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2302 scrub_workers_put(root); 2865 scrub_workers_put(fs_info);
2303 return -EINPROGRESS; 2866 return -EINPROGRESS;
2304 } 2867 }
2305 sdev = scrub_setup_dev(dev); 2868 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2306 if (IS_ERR(sdev)) { 2869 sctx = scrub_setup_ctx(dev, is_dev_replace);
2870 if (IS_ERR(sctx)) {
2307 mutex_unlock(&fs_info->scrub_lock); 2871 mutex_unlock(&fs_info->scrub_lock);
2308 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2872 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2309 scrub_workers_put(root); 2873 scrub_workers_put(fs_info);
2310 return PTR_ERR(sdev); 2874 return PTR_ERR(sctx);
2311 } 2875 }
2312 sdev->readonly = readonly; 2876 sctx->readonly = readonly;
2313 dev->scrub_device = sdev; 2877 dev->scrub_device = sctx;
2314 2878
2315 atomic_inc(&fs_info->scrubs_running); 2879 atomic_inc(&fs_info->scrubs_running);
2316 mutex_unlock(&fs_info->scrub_lock); 2880 mutex_unlock(&fs_info->scrub_lock);
2317 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2881 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2318 2882
2319 down_read(&fs_info->scrub_super_lock); 2883 if (!is_dev_replace) {
2320 ret = scrub_supers(sdev); 2884 down_read(&fs_info->scrub_super_lock);
2321 up_read(&fs_info->scrub_super_lock); 2885 ret = scrub_supers(sctx, dev);
2886 up_read(&fs_info->scrub_super_lock);
2887 }
2322 2888
2323 if (!ret) 2889 if (!ret)
2324 ret = scrub_enumerate_chunks(sdev, start, end); 2890 ret = scrub_enumerate_chunks(sctx, dev, start, end,
2891 is_dev_replace);
2325 2892
2326 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2893 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2327 atomic_dec(&fs_info->scrubs_running); 2894 atomic_dec(&fs_info->scrubs_running);
2328 wake_up(&fs_info->scrub_pause_wait); 2895 wake_up(&fs_info->scrub_pause_wait);
2329 2896
2330 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); 2897 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2331 2898
2332 if (progress) 2899 if (progress)
2333 memcpy(progress, &sdev->stat, sizeof(*progress)); 2900 memcpy(progress, &sctx->stat, sizeof(*progress));
2334 2901
2335 mutex_lock(&fs_info->scrub_lock); 2902 mutex_lock(&fs_info->scrub_lock);
2336 dev->scrub_device = NULL; 2903 dev->scrub_device = NULL;
2337 mutex_unlock(&fs_info->scrub_lock); 2904 mutex_unlock(&fs_info->scrub_lock);
2338 2905
2339 scrub_free_dev(sdev); 2906 scrub_free_ctx(sctx);
2340 scrub_workers_put(root); 2907 scrub_workers_put(fs_info);
2341 2908
2342 return ret; 2909 return ret;
2343} 2910}
@@ -2377,9 +2944,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root)
2377 up_write(&root->fs_info->scrub_super_lock); 2944 up_write(&root->fs_info->scrub_super_lock);
2378} 2945}
2379 2946
2380int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 2947int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2381{ 2948{
2382
2383 mutex_lock(&fs_info->scrub_lock); 2949 mutex_lock(&fs_info->scrub_lock);
2384 if (!atomic_read(&fs_info->scrubs_running)) { 2950 if (!atomic_read(&fs_info->scrubs_running)) {
2385 mutex_unlock(&fs_info->scrub_lock); 2951 mutex_unlock(&fs_info->scrub_lock);
@@ -2399,23 +2965,18 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2399 return 0; 2965 return 0;
2400} 2966}
2401 2967
2402int btrfs_scrub_cancel(struct btrfs_root *root) 2968int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
2969 struct btrfs_device *dev)
2403{ 2970{
2404 return __btrfs_scrub_cancel(root->fs_info); 2971 struct scrub_ctx *sctx;
2405}
2406
2407int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
2408{
2409 struct btrfs_fs_info *fs_info = root->fs_info;
2410 struct scrub_dev *sdev;
2411 2972
2412 mutex_lock(&fs_info->scrub_lock); 2973 mutex_lock(&fs_info->scrub_lock);
2413 sdev = dev->scrub_device; 2974 sctx = dev->scrub_device;
2414 if (!sdev) { 2975 if (!sctx) {
2415 mutex_unlock(&fs_info->scrub_lock); 2976 mutex_unlock(&fs_info->scrub_lock);
2416 return -ENOTCONN; 2977 return -ENOTCONN;
2417 } 2978 }
2418 atomic_inc(&sdev->cancel_req); 2979 atomic_inc(&sctx->cancel_req);
2419 while (dev->scrub_device) { 2980 while (dev->scrub_device) {
2420 mutex_unlock(&fs_info->scrub_lock); 2981 mutex_unlock(&fs_info->scrub_lock);
2421 wait_event(fs_info->scrub_pause_wait, 2982 wait_event(fs_info->scrub_pause_wait,
@@ -2438,12 +2999,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
2438 * does not go away in cancel_dev. FIXME: find a better solution 2999 * does not go away in cancel_dev. FIXME: find a better solution
2439 */ 3000 */
2440 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3001 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2441 dev = btrfs_find_device(root, devid, NULL, NULL); 3002 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2442 if (!dev) { 3003 if (!dev) {
2443 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3004 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2444 return -ENODEV; 3005 return -ENODEV;
2445 } 3006 }
2446 ret = btrfs_scrub_cancel_dev(root, dev); 3007 ret = btrfs_scrub_cancel_dev(fs_info, dev);
2447 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3008 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2448 3009
2449 return ret; 3010 return ret;
@@ -2453,15 +3014,284 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2453 struct btrfs_scrub_progress *progress) 3014 struct btrfs_scrub_progress *progress)
2454{ 3015{
2455 struct btrfs_device *dev; 3016 struct btrfs_device *dev;
2456 struct scrub_dev *sdev = NULL; 3017 struct scrub_ctx *sctx = NULL;
2457 3018
2458 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3019 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2459 dev = btrfs_find_device(root, devid, NULL, NULL); 3020 dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
2460 if (dev) 3021 if (dev)
2461 sdev = dev->scrub_device; 3022 sctx = dev->scrub_device;
2462 if (sdev) 3023 if (sctx)
2463 memcpy(progress, &sdev->stat, sizeof(*progress)); 3024 memcpy(progress, &sctx->stat, sizeof(*progress));
2464 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3025 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2465 3026
2466 return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; 3027 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3028}
3029
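/*
 * Map a logical extent to the physical address, device and mirror
 * number of its first stripe. If the mapping fails, comes back shorter
 * than requested or hits a stripe without a block device, the out
 * parameters are left untouched, so callers keep the values they
 * passed in.
 */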
3030static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3031 u64 extent_logical, u64 extent_len,
3032 u64 *extent_physical,
3033 struct btrfs_device **extent_dev,
3034 int *extent_mirror_num)
3035{
3036 u64 mapped_length;
3037 struct btrfs_bio *bbio = NULL;
3038 int ret;
3039
3040 mapped_length = extent_len;
3041 ret = btrfs_map_block(fs_info, READ, extent_logical,
3042 &mapped_length, &bbio, 0);
3043 if (ret || !bbio || mapped_length < extent_len ||
3044 !bbio->stripes[0].dev->bdev) {
3045 kfree(bbio);
3046 return;
3047 }
3048
3049 *extent_physical = bbio->stripes[0].physical;
3050 *extent_mirror_num = bbio->mirror_num;
3051 *extent_dev = bbio->stripes[0].dev;
3052 kfree(bbio);
3053}
3054
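/*
 * Prepare the write context. A plain scrub only needs the lock
 * initialized; in dev-replace mode the target device is recorded and
 * the per-bio page count is capped by what the target block device
 * accepts.
 */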
3055static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3056 struct scrub_wr_ctx *wr_ctx,
3057 struct btrfs_fs_info *fs_info,
3058 struct btrfs_device *dev,
3059 int is_dev_replace)
3060{
3061 WARN_ON(wr_ctx->wr_curr_bio != NULL);
3062
3063 mutex_init(&wr_ctx->wr_lock);
3064 wr_ctx->wr_curr_bio = NULL;
3065 if (!is_dev_replace)
3066 return 0;
3067
3068 WARN_ON(!dev->bdev);
3069 wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3070 bio_get_nr_vecs(dev->bdev));
3071 wr_ctx->tgtdev = dev;
3072 atomic_set(&wr_ctx->flush_all_writes, 0);
3073 return 0;
3074}
3075
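/*
 * Counterpart of scrub_setup_wr_ctx(): the current write bio, if one
 * is still hanging off the context, is freed under the lock.
 */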
3076static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3077{
3078 mutex_lock(&wr_ctx->wr_lock);
3079 kfree(wr_ctx->wr_curr_bio);
3080 wr_ctx->wr_curr_bio = NULL;
3081 mutex_unlock(&wr_ctx->wr_lock);
3082}
3083
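/*
 * Queue a NOCOW range for copying to the dev-replace target. The real
 * work happens in copy_nocow_pages_worker() below; the pending-workers
 * count is raised here and dropped by the worker, so the scrub cannot
 * complete while a copy is still queued.
 */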
3084static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3085 int mirror_num, u64 physical_for_dev_replace)
3086{
3087 struct scrub_copy_nocow_ctx *nocow_ctx;
3088 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3089
3090 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3091 if (!nocow_ctx) {
3092 spin_lock(&sctx->stat_lock);
3093 sctx->stat.malloc_errors++;
3094 spin_unlock(&sctx->stat_lock);
3095 return -ENOMEM;
3096 }
3097
3098 scrub_pending_trans_workers_inc(sctx);
3099
3100 nocow_ctx->sctx = sctx;
3101 nocow_ctx->logical = logical;
3102 nocow_ctx->len = len;
3103 nocow_ctx->mirror_num = mirror_num;
3104 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3105 nocow_ctx->work.func = copy_nocow_pages_worker;
3106 btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3107 &nocow_ctx->work);
3108
3109 return 0;
3110}
3111
3112static void copy_nocow_pages_worker(struct btrfs_work *work)
3113{
3114 struct scrub_copy_nocow_ctx *nocow_ctx =
3115 container_of(work, struct scrub_copy_nocow_ctx, work);
3116 struct scrub_ctx *sctx = nocow_ctx->sctx;
3117 u64 logical = nocow_ctx->logical;
3118 u64 len = nocow_ctx->len;
3119 int mirror_num = nocow_ctx->mirror_num;
3120 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3121 int ret;
3122 struct btrfs_trans_handle *trans = NULL;
3123 struct btrfs_fs_info *fs_info;
3124 struct btrfs_path *path;
3125 struct btrfs_root *root;
3126 int not_written = 0;
3127
3128 fs_info = sctx->dev_root->fs_info;
3129 root = fs_info->extent_root;
3130
3131 path = btrfs_alloc_path();
3132 if (!path) {
3133 spin_lock(&sctx->stat_lock);
3134 sctx->stat.malloc_errors++;
3135 spin_unlock(&sctx->stat_lock);
3136 not_written = 1;
3137 goto out;
3138 }
3139
3140 trans = btrfs_join_transaction(root);
3141 if (IS_ERR(trans)) {
3142 not_written = 1;
3143 goto out;
3144 }
3145
3146 ret = iterate_inodes_from_logical(logical, fs_info, path,
3147 copy_nocow_pages_for_inode,
3148 nocow_ctx);
3149 if (ret != 0 && ret != -ENOENT) {
3150 pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
3151 (unsigned long long)logical,
3152 (unsigned long long)physical_for_dev_replace,
3153 (unsigned long long)len,
3154 (unsigned long long)mirror_num, ret);
3155 not_written = 1;
3156 goto out;
3157 }
3158
3159out:
3160 if (trans && !IS_ERR(trans))
3161 btrfs_end_transaction(trans, root);
3162 if (not_written)
 3163 btrfs_dev_replace_stats_inc(
 3164 &fs_info->dev_replace.num_uncorrectable_read_errors);
3165
3166 btrfs_free_path(path);
3167 kfree(nocow_ctx);
3168
3169 scrub_pending_trans_workers_dec(sctx);
3170}
3171
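/*
 * Called for each inode that references the NOCOW range. The data is
 * read through the page cache, and every up-to-date, non-dirty page is
 * written raw to the replacement target via write_page_nocow(). Dirty
 * pages are skipped, presumably because ordinary writeback will reach
 * the target copy anyway.
 */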
3172static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3173{
3174 unsigned long index;
3175 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3176 int ret = 0;
3177 struct btrfs_key key;
3178 struct inode *inode = NULL;
3179 struct btrfs_root *local_root;
3180 u64 physical_for_dev_replace;
3181 u64 len;
3182 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3183
3184 key.objectid = root;
3185 key.type = BTRFS_ROOT_ITEM_KEY;
3186 key.offset = (u64)-1;
3187 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3188 if (IS_ERR(local_root))
3189 return PTR_ERR(local_root);
3190
3191 key.type = BTRFS_INODE_ITEM_KEY;
3192 key.objectid = inum;
3193 key.offset = 0;
3194 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3195 if (IS_ERR(inode))
3196 return PTR_ERR(inode);
3197
3198 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3199 len = nocow_ctx->len;
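	/*
	 * The copy proceeds one full page at a time; a tail smaller than
	 * PAGE_CACHE_SIZE is not copied by this loop.
	 */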
3200 while (len >= PAGE_CACHE_SIZE) {
3201 struct page *page = NULL;
3202 int ret_sub;
3203
3204 index = offset >> PAGE_CACHE_SHIFT;
3205
3206 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3207 if (!page) {
3208 pr_err("find_or_create_page() failed\n");
3209 ret = -ENOMEM;
3210 goto next_page;
3211 }
3212
3213 if (PageUptodate(page)) {
3214 if (PageDirty(page))
3215 goto next_page;
3216 } else {
3217 ClearPageError(page);
 3218 ret_sub = extent_read_full_page(
 3219 &BTRFS_I(inode)->io_tree,
 3220 page, btrfs_get_extent,
 3221 nocow_ctx->mirror_num);
3222 if (ret_sub) {
3223 ret = ret_sub;
3224 goto next_page;
3225 }
 3226 lock_page(page); /* re-take the lock dropped by the read end_io, so the unlock at next_page stays balanced */
3227 if (!PageUptodate(page)) {
3228 ret = -EIO;
3229 goto next_page;
3230 }
3231 }
3232 ret_sub = write_page_nocow(nocow_ctx->sctx,
3233 physical_for_dev_replace, page);
3234 if (ret_sub) {
3235 ret = ret_sub;
3236 goto next_page;
3237 }
3238
3239next_page:
3240 if (page) {
3241 unlock_page(page);
3242 put_page(page);
3243 }
3244 offset += PAGE_CACHE_SIZE;
3245 physical_for_dev_replace += PAGE_CACHE_SIZE;
3246 len -= PAGE_CACHE_SIZE;
3247 }
3248
3249 if (inode)
3250 iput(inode);
3251 return ret;
3252}
3253
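/*
 * Write a single page synchronously to the dev-replace target device
 * at the given physical offset, waiting on an on-stack completion. Any
 * failure is charged to the target device's write-error statistics.
 */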
3254static int write_page_nocow(struct scrub_ctx *sctx,
3255 u64 physical_for_dev_replace, struct page *page)
3256{
3257 struct bio *bio;
3258 struct btrfs_device *dev;
3259 int ret;
3260 DECLARE_COMPLETION_ONSTACK(compl);
3261
3262 dev = sctx->wr_ctx.tgtdev;
3263 if (!dev)
3264 return -EIO;
3265 if (!dev->bdev) {
3266 printk_ratelimited(KERN_WARNING
3267 "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3268 return -EIO;
3269 }
3270 bio = bio_alloc(GFP_NOFS, 1);
3271 if (!bio) {
3272 spin_lock(&sctx->stat_lock);
3273 sctx->stat.malloc_errors++;
3274 spin_unlock(&sctx->stat_lock);
3275 return -ENOMEM;
3276 }
3277 bio->bi_private = &compl;
3278 bio->bi_end_io = scrub_complete_bio_end_io;
3279 bio->bi_size = 0;
3280 bio->bi_sector = physical_for_dev_replace >> 9;
3281 bio->bi_bdev = dev->bdev;
3282 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3283 if (ret != PAGE_CACHE_SIZE) {
3284leave_with_eio:
3285 bio_put(bio);
3286 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3287 return -EIO;
3288 }
3289 btrfsic_submit_bio(WRITE_SYNC, bio);
3290 wait_for_completion(&compl);
3291
3292 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
3293 goto leave_with_eio;
3294
3295 bio_put(bio);
3296 return 0;
2467} 3297}
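/*
 * Using one page per bio and waiting synchronously keeps the error
 * handling in write_page_nocow() trivial; throughput matters little
 * here, as this path only runs for NOCOW extents during device
 * replace.
 */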