Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/Makefile       |    2
-rw-r--r--  fs/btrfs/ctree.h        |   37
-rw-r--r--  fs/btrfs/disk-io.c      |   12
-rw-r--r--  fs/btrfs/file-item.c    |    8
-rw-r--r--  fs/btrfs/inode.c        |    2
-rw-r--r--  fs/btrfs/ioctl.h        |   37
-rw-r--r--  fs/btrfs/relocation.c   |    2
-rw-r--r--  fs/btrfs/scrub.c        | 1492
-rw-r--r--  fs/btrfs/transaction.c  |    3
-rw-r--r--  fs/btrfs/tree-log.c     |    6
-rw-r--r--  fs/btrfs/volumes.c      |    4
-rw-r--r--  fs/btrfs/volumes.h      |    6
12 files changed, 1600 insertions, 11 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 31610ea73aec..8fda3133c1b8 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,4 +7,4 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
-	   compression.o delayed-ref.o relocation.o
+	   compression.o delayed-ref.o relocation.o scrub.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2e61fe1b6b8c..31141ba6072d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -23,6 +23,7 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/fs.h>
+#include <linux/rwsem.h>
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
@@ -33,6 +34,7 @@
33#include "extent_io.h" 34#include "extent_io.h"
34#include "extent_map.h" 35#include "extent_map.h"
35#include "async-thread.h" 36#include "async-thread.h"
37#include "ioctl.h"
36 38
37struct btrfs_trans_handle; 39struct btrfs_trans_handle;
38struct btrfs_transaction; 40struct btrfs_transaction;
@@ -510,6 +512,12 @@ struct btrfs_extent_item_v0 {
 /* use full backrefs for extent pointers in the block */
 #define BTRFS_BLOCK_FLAG_FULL_BACKREF	(1ULL << 8)
 
+/*
+ * this flag is only used internally by scrub and may be changed at any time
+ * it is only declared here to avoid collisions
+ */
+#define BTRFS_EXTENT_FLAG_SUPER		(1ULL << 48)
+
 struct btrfs_tree_block_info {
 	struct btrfs_disk_key key;
 	u8 level;
@@ -1077,6 +1085,17 @@ struct btrfs_fs_info {
 
 	void *bdev_holder;
 
+	/* private scrub information */
+	struct mutex scrub_lock;
+	atomic_t scrubs_running;
+	atomic_t scrub_pause_req;
+	atomic_t scrubs_paused;
+	atomic_t scrub_cancel_req;
+	wait_queue_head_t scrub_pause_wait;
+	struct rw_semaphore scrub_super_lock;
+	int scrub_workers_refcnt;
+	struct btrfs_workers scrub_workers;
+
 	/* filesystem state */
 	u64 fs_state;
 };
@@ -2472,8 +2491,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
-int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
-			     u64 end, struct list_head *list);
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+			     struct list_head *list, int search_commit);
 /* inode.c */
 
 /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
@@ -2637,4 +2656,18 @@ void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
 			      u64 *bytes_to_reserve);
 void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
 			       struct btrfs_pending_snapshot *pending);
+
+/* scrub.c */
+int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
+		    struct btrfs_scrub_progress *progress);
+int btrfs_scrub_pause(struct btrfs_root *root);
+int btrfs_scrub_pause_super(struct btrfs_root *root);
+int btrfs_scrub_continue(struct btrfs_root *root);
+int btrfs_scrub_continue_super(struct btrfs_root *root);
+int btrfs_scrub_cancel(struct btrfs_root *root);
+int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
+int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
+int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+			 struct btrfs_scrub_progress *progress);
+
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fe5aec9b3924..e48e8095c61f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1773,6 +1773,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->ordered_extents);
 	spin_lock_init(&fs_info->ordered_extent_lock);
 
+	mutex_init(&fs_info->scrub_lock);
+	atomic_set(&fs_info->scrubs_running, 0);
+	atomic_set(&fs_info->scrub_pause_req, 0);
+	atomic_set(&fs_info->scrubs_paused, 0);
+	atomic_set(&fs_info->scrub_cancel_req, 0);
+	init_waitqueue_head(&fs_info->scrub_pause_wait);
+	init_rwsem(&fs_info->scrub_super_lock);
+	fs_info->scrub_workers_refcnt = 0;
+	btrfs_init_workers(&fs_info->scrub_workers, "scrub",
+			   fs_info->thread_pool_size, &fs_info->generic_worker);
+
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
 	sb->s_bdi = &fs_info->bdi;
@@ -2599,6 +2610,7 @@ int close_ctree(struct btrfs_root *root)
 	fs_info->closing = 1;
 	smp_mb();
 
+	btrfs_scrub_cancel(root);
 	btrfs_put_block_group_cache(fs_info);
 
 	/*
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a6a9d4e8b491..39ca7c1250e7 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -266,7 +266,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
 }
 
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
-			     struct list_head *list)
+			     struct list_head *list, int search_commit)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -283,6 +283,12 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
+	if (search_commit) {
+		path->skip_locking = 1;
+		path->reada = 2;
+		path->search_commit_root = 1;
+	}
+
 	key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
 	key.offset = start;
 	key.type = BTRFS_EXTENT_CSUM_KEY;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 870869aab0b8..27142446b30a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1007,7 +1007,7 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
 	LIST_HEAD(list);
 
 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
-				       bytenr + num_bytes - 1, &list);
+				       bytenr + num_bytes - 1, &list, 0);
 	if (ret == 0 && list_empty(&list))
 		return 0;
 
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 8fb382167b13..37ac030d64b4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -42,6 +42,43 @@ struct btrfs_ioctl_vol_args_v2 {
 	char name[BTRFS_SUBVOL_NAME_MAX + 1];
 };
 
+/*
+ * structure to report errors and progress to userspace, either as a
+ * result of a finished scrub, a canceled scrub or a progress inquiry
+ */
+struct btrfs_scrub_progress {
+	__u64 data_extents_scrubbed;	/* # of data extents scrubbed */
+	__u64 tree_extents_scrubbed;	/* # of tree extents scrubbed */
+	__u64 data_bytes_scrubbed;	/* # of data bytes scrubbed */
+	__u64 tree_bytes_scrubbed;	/* # of tree bytes scrubbed */
+	__u64 read_errors;		/* # of read errors encountered (EIO) */
+	__u64 csum_errors;		/* # of failed csum checks */
+	__u64 verify_errors;		/* # of occurrences where the metadata
+					 * of a tree block did not match the
+					 * expected values, like generation or
+					 * logical */
+	__u64 no_csum;			/* # of 4k data blocks for which no csum
+					 * is present, probably the result of
+					 * data written with nodatasum */
+	__u64 csum_discards;		/* # of csums for which no data was
+					 * found in the extent tree */
+	__u64 super_errors;		/* # of bad super blocks encountered */
+	__u64 malloc_errors;		/* # of internal kmalloc errors. These
+					 * will likely cause an incomplete
+					 * scrub */
+	__u64 uncorrectable_errors;	/* # of errors where either no intact
+					 * copy was found or the writeback
+					 * failed */
+	__u64 corrected_errors;		/* # of errors corrected */
+	__u64 last_physical;		/* last physical address scrubbed. In
+					 * case a scrub was aborted, this can
+					 * be used to restart the scrub */
+	__u64 unverified_errors;	/* # of occurrences where a read for a
+					 * full (64k) bio failed, but the re-
+					 * check succeeded for each 4k piece.
+					 * Intermittent error. */
+};
+
 #define BTRFS_INO_LOOKUP_PATH_MAX 4080
 struct btrfs_ioctl_ino_lookup_args {
 	__u64 treeid;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 58250e09eb05..db1dffa9952b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4242,7 +4242,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
 
 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
-				       disk_bytenr + len - 1, &list);
+				       disk_bytenr + len - 1, &list, 0);
 
 	while (!list_empty(&list)) {
 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
new file mode 100644
index 000000000000..70f9fa772ee9
--- /dev/null
+++ b/fs/btrfs/scrub.c
@@ -0,0 +1,1492 @@
+/*
+ * Copyright (C) 2011 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "ordered-data.h"
+
+/*
+ * This is only the first step towards a full-featured scrub. It reads all
+ * extents and super blocks and verifies the checksums. In case a bad checksum
+ * is found or the extent cannot be read, good data will be written back if
+ * any can be found.
+ *
+ * Future enhancements:
+ * - To enhance the performance, better read-ahead strategies for the
+ *   extent-tree can be employed.
+ * - In case an unrepairable extent is encountered, track which files are
+ *   affected and report them
+ * - In case of a read error on files with nodatasum, map the file and read
+ *   the extent to trigger a writeback of the good copy
+ * - track and record media errors, throw out bad devices
+ * - add a readonly mode
+ * - add a mode to also read unallocated space
+ * - make the prefetch cancellable
+ */
+
+struct scrub_bio;
+struct scrub_page;
+struct scrub_dev;
+struct scrub_fixup;
+static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_checksum(struct btrfs_work *work);
+static int scrub_checksum_data(struct scrub_dev *sdev,
+			       struct scrub_page *spag, void *buffer);
+static int scrub_checksum_tree_block(struct scrub_dev *sdev,
+				     struct scrub_page *spag, u64 logical,
+				     void *buffer);
+static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
+static void scrub_recheck_end_io(struct bio *bio, int err);
+static void scrub_fixup_worker(struct btrfs_work *work);
+static void scrub_fixup(struct scrub_fixup *fixup);
+
+#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
+#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */
+
+struct scrub_page {
+	u64			flags;	/* extent flags */
+	u64			generation;
+	u64			mirror_num;
+	int			have_csum;
+	u8			csum[BTRFS_CSUM_SIZE];
+};
+
+struct scrub_bio {
+	int			index;
+	struct scrub_dev	*sdev;
+	struct bio		*bio;
+	int			err;
+	u64			logical;
+	u64			physical;
+	struct scrub_page	spag[SCRUB_PAGES_PER_BIO];
+	u64			count;
+	int			next_free;
+	struct btrfs_work	work;
+};
+
+struct scrub_dev {
+	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
+	struct btrfs_device	*dev;
+	int			first_free;
+	int			curr;
+	atomic_t		in_flight;
+	spinlock_t		list_lock;
+	wait_queue_head_t	list_wait;
+	u16			csum_size;
+	struct list_head	csum_list;
+	atomic_t		cancel_req;
+	/*
+	 * statistics
+	 */
+	struct btrfs_scrub_progress stat;
+	spinlock_t		stat_lock;
+};
+
+struct scrub_fixup {
+	struct scrub_dev	*sdev;
+	struct bio		*bio;
+	u64			logical;
+	u64			physical;
+	struct scrub_page	spag;
+	struct btrfs_work	work;
+	int			err;
+	int			recheck;
+};
+
+static void scrub_free_csums(struct scrub_dev *sdev)
+{
+	while (!list_empty(&sdev->csum_list)) {
+		struct btrfs_ordered_sum *sum;
+		sum = list_first_entry(&sdev->csum_list,
+				       struct btrfs_ordered_sum, list);
+		list_del(&sum->list);
+		kfree(sum);
+	}
+}
+
+static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
+{
+	int i;
+	int j;
+	struct page *last_page;
+
+	if (!sdev)
+		return;
+
+	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
+		struct scrub_bio *sbio = sdev->bios[i];
+		struct bio *bio;
+
+		if (!sbio)
+			break;
+
+		bio = sbio->bio;
+		if (bio) {
+			last_page = NULL;
+			for (j = 0; j < bio->bi_vcnt; ++j) {
+				if (bio->bi_io_vec[j].bv_page == last_page)
+					continue;
+				last_page = bio->bi_io_vec[j].bv_page;
+				__free_page(last_page);
+			}
+			bio_put(bio);
+		}
+		kfree(sbio);
+	}
+
+	scrub_free_csums(sdev);
+	kfree(sdev);
+}
+
+static noinline_for_stack
+struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
+{
+	struct scrub_dev *sdev;
+	int i;
+	int j;
+	int ret;
+	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+
+	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
+	if (!sdev)
+		goto nomem;
+	sdev->dev = dev;
+	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
+		struct bio *bio;
+		struct scrub_bio *sbio;
+
+		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
+		if (!sbio)
+			goto nomem;
+		sdev->bios[i] = sbio;
+
+		bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
+		if (!bio)
+			goto nomem;
+
+		sbio->index = i;
+		sbio->sdev = sdev;
+		sbio->bio = bio;
+		sbio->count = 0;
+		sbio->work.func = scrub_checksum;
+		bio->bi_private = sdev->bios[i];
+		bio->bi_end_io = scrub_bio_end_io;
+		bio->bi_sector = 0;
+		bio->bi_bdev = dev->bdev;
+		bio->bi_size = 0;
+
+		for (j = 0; j < SCRUB_PAGES_PER_BIO; ++j) {
+			struct page *page;
+			page = alloc_page(GFP_NOFS);
+			if (!page)
+				goto nomem;
+
+			ret = bio_add_page(bio, page, PAGE_SIZE, 0);
+			if (!ret)
+				goto nomem;
+		}
+		WARN_ON(bio->bi_vcnt != SCRUB_PAGES_PER_BIO);
+
+		if (i != SCRUB_BIOS_PER_DEV-1)
+			sdev->bios[i]->next_free = i + 1;
+		else
+			sdev->bios[i]->next_free = -1;
+	}
+	sdev->first_free = 0;
+	sdev->curr = -1;
+	atomic_set(&sdev->in_flight, 0);
+	atomic_set(&sdev->cancel_req, 0);
+	sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+	INIT_LIST_HEAD(&sdev->csum_list);
+
+	spin_lock_init(&sdev->list_lock);
+	spin_lock_init(&sdev->stat_lock);
+	init_waitqueue_head(&sdev->list_wait);
+	return sdev;
+
+nomem:
+	scrub_free_dev(sdev);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * scrub_recheck_error gets called when either verification of the page
+ * failed or the bio failed to read, e.g. with EIO. In the latter case,
+ * recheck_error gets called for every page in the bio, even though only
+ * one may be bad
+ */
+static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
+{
+	struct scrub_dev *sdev = sbio->sdev;
+	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+	struct bio *bio = NULL;
+	struct page *page = NULL;
+	struct scrub_fixup *fixup = NULL;
+	int ret;
+
+	/*
+	 * while we're in here we do not want the transaction to commit.
+	 * To prevent it, we increment scrubs_running. scrub_pause will
+	 * have to wait until we're finished.
+	 * We can safely increment scrubs_running here, because we're
+	 * in the context of the original bio which is still marked in_flight
+	 */
+	atomic_inc(&fs_info->scrubs_running);
+
+	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+	if (!fixup)
+		goto malloc_error;
+
+	fixup->logical = sbio->logical + ix * PAGE_SIZE;
+	fixup->physical = sbio->physical + ix * PAGE_SIZE;
+	fixup->spag = sbio->spag[ix];
+	fixup->sdev = sdev;
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	if (!bio)
+		goto malloc_error;
+	bio->bi_private = fixup;
+	bio->bi_size = 0;
+	bio->bi_bdev = sdev->dev->bdev;
+	fixup->bio = bio;
+	fixup->recheck = 0;
+
+	page = alloc_page(GFP_NOFS);
+	if (!page)
+		goto malloc_error;
+
+	ret = bio_add_page(bio, page, PAGE_SIZE, 0);
+	if (!ret)
+		goto malloc_error;
+
+	if (!sbio->err) {
+		/*
+		 * shorter path: just a checksum error, go ahead and correct it
+		 */
+		scrub_fixup_worker(&fixup->work);
+		return;
+	}
+
+	/*
+	 * an I/O error occurred for one of the blocks in the bio, not
+	 * necessarily for this one, so first try to read it separately
+	 */
+	fixup->work.func = scrub_fixup_worker;
+	fixup->recheck = 1;
+	bio->bi_end_io = scrub_recheck_end_io;
+	bio->bi_sector = fixup->physical >> 9;
+	bio->bi_bdev = sdev->dev->bdev;
+	submit_bio(0, bio);
+
+	return;
+
+malloc_error:
+	if (bio)
+		bio_put(bio);
+	if (page)
+		__free_page(page);
+	kfree(fixup);
+	spin_lock(&sdev->stat_lock);
+	++sdev->stat.malloc_errors;
+	spin_unlock(&sdev->stat_lock);
+	atomic_dec(&fs_info->scrubs_running);
+	wake_up(&fs_info->scrub_pause_wait);
+}
+
+static void scrub_recheck_end_io(struct bio *bio, int err)
+{
+	struct scrub_fixup *fixup = bio->bi_private;
+	struct btrfs_fs_info *fs_info = fixup->sdev->dev->dev_root->fs_info;
+
+	fixup->err = err;
+	btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
+}
+
+static int scrub_fixup_check(struct scrub_fixup *fixup)
+{
+	int ret = 1;
+	struct page *page;
+	void *buffer;
+	u64 flags = fixup->spag.flags;
+
+	page = fixup->bio->bi_io_vec[0].bv_page;
+	buffer = kmap_atomic(page, KM_USER0);
+	if (flags & BTRFS_EXTENT_FLAG_DATA) {
+		ret = scrub_checksum_data(fixup->sdev,
+					  &fixup->spag, buffer);
+	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		ret = scrub_checksum_tree_block(fixup->sdev,
+						&fixup->spag,
+						fixup->logical,
+						buffer);
+	} else {
+		WARN_ON(1);
+	}
+	kunmap_atomic(buffer, KM_USER0);
+
+	return ret;
+}
+
+static void scrub_fixup_worker(struct btrfs_work *work)
+{
+	struct scrub_fixup *fixup;
+	struct btrfs_fs_info *fs_info;
+	u64 flags;
+	int ret = 1;
+
+	fixup = container_of(work, struct scrub_fixup, work);
+	fs_info = fixup->sdev->dev->dev_root->fs_info;
+	flags = fixup->spag.flags;
+
+	if (fixup->recheck && fixup->err == 0)
+		ret = scrub_fixup_check(fixup);
+
+	if (ret || fixup->err)
+		scrub_fixup(fixup);
+
+	__free_page(fixup->bio->bi_io_vec[0].bv_page);
+	bio_put(fixup->bio);
+
+	atomic_dec(&fs_info->scrubs_running);
+	wake_up(&fs_info->scrub_pause_wait);
+
+	kfree(fixup);
+}
+
+static void scrub_fixup_end_io(struct bio *bio, int err)
+{
+	complete((struct completion *)bio->bi_private);
+}
+
+static void scrub_fixup(struct scrub_fixup *fixup)
+{
+	struct scrub_dev *sdev = fixup->sdev;
+	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+	struct btrfs_multi_bio *multi = NULL;
+	struct bio *bio = fixup->bio;
+	u64 length;
+	int i;
+	int ret;
+	DECLARE_COMPLETION_ONSTACK(complete);
+
+	if ((fixup->spag.flags & BTRFS_EXTENT_FLAG_DATA) &&
+	    (fixup->spag.have_csum == 0)) {
+		/*
+		 * nodatasum, don't try to fix anything
+		 * FIXME: we can do better, open the inode and trigger a
+		 * writeback
+		 */
+		goto uncorrectable;
+	}
+
+	length = PAGE_SIZE;
+	ret = btrfs_map_block(map_tree, REQ_WRITE, fixup->logical, &length,
+			      &multi, 0);
+	if (ret || !multi || length < PAGE_SIZE) {
+		printk(KERN_ERR
+		       "scrub_fixup: btrfs_map_block failed us for %llu\n",
+		       (unsigned long long)fixup->logical);
+		WARN_ON(1);
+		return;
+	}
+
+	if (multi->num_stripes == 1) {
+		/* there aren't any replicas */
+		goto uncorrectable;
+	}
+
+	/*
+	 * first find a good copy
+	 */
+	for (i = 0; i < multi->num_stripes; ++i) {
+		if (i == fixup->spag.mirror_num)
+			continue;
+
+		bio->bi_sector = multi->stripes[i].physical >> 9;
+		bio->bi_bdev = multi->stripes[i].dev->bdev;
+		bio->bi_size = PAGE_SIZE;
+		bio->bi_next = NULL;
+		bio->bi_flags |= 1 << BIO_UPTODATE;
+		bio->bi_comp_cpu = -1;
+		bio->bi_end_io = scrub_fixup_end_io;
+		bio->bi_private = &complete;
+
+		submit_bio(0, bio);
+
+		wait_for_completion(&complete);
+
+		if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+			/* I/O-error, this is not a good copy */
+			continue;
+
+		ret = scrub_fixup_check(fixup);
+		if (ret == 0)
+			break;
+	}
+	if (i == multi->num_stripes)
+		goto uncorrectable;
+
+	/*
+	 * the bio now contains good data, write it back
+	 */
+	bio->bi_sector = fixup->physical >> 9;
+	bio->bi_bdev = sdev->dev->bdev;
+	bio->bi_size = PAGE_SIZE;
+	bio->bi_next = NULL;
+	bio->bi_flags |= 1 << BIO_UPTODATE;
+	bio->bi_comp_cpu = -1;
+	bio->bi_end_io = scrub_fixup_end_io;
+	bio->bi_private = &complete;
+
+	submit_bio(REQ_WRITE, bio);
+
+	wait_for_completion(&complete);
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		/* I/O-error, writeback failed, give up */
+		goto uncorrectable;
+
+	kfree(multi);
+	spin_lock(&sdev->stat_lock);
+	++sdev->stat.corrected_errors;
+	spin_unlock(&sdev->stat_lock);
+
+	if (printk_ratelimit())
+		printk(KERN_ERR "btrfs: fixed up at %llu\n",
+		       (unsigned long long)fixup->logical);
+	return;
+
+uncorrectable:
+	kfree(multi);
+	spin_lock(&sdev->stat_lock);
+	++sdev->stat.uncorrectable_errors;
+	spin_unlock(&sdev->stat_lock);
+
+	if (printk_ratelimit())
+		printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
+		       (unsigned long long)fixup->logical);
+}
+
+static void scrub_bio_end_io(struct bio *bio, int err)
+{
+	struct scrub_bio *sbio = bio->bi_private;
+	struct scrub_dev *sdev = sbio->sdev;
+	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+
+	sbio->err = err;
+
+	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+}
+
+static void scrub_checksum(struct btrfs_work *work)
+{
+	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+	struct scrub_dev *sdev = sbio->sdev;
+	struct page *page;
+	void *buffer;
+	int i;
+	u64 flags;
+	u64 logical;
+	int ret;
+
+	if (sbio->err) {
+		struct bio *bio;
+		struct bio *old_bio;
+
+		for (i = 0; i < sbio->count; ++i)
+			scrub_recheck_error(sbio, i);
+		spin_lock(&sdev->stat_lock);
+		++sdev->stat.read_errors;
+		spin_unlock(&sdev->stat_lock);
+
+		/*
+		 * FIXME: allocate a new bio after a media error. I haven't
+		 * figured out how to reuse this one
+		 */
+		old_bio = sbio->bio;
+		bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
+		if (!bio) {
+			/*
+			 * alloc failed. cancel the scrub and don't requeue
+			 * this sbio
+			 */
+			printk(KERN_ERR "btrfs scrub: allocation failure, "
+			       "cancelling scrub\n");
+			atomic_inc(&sdev->dev->dev_root->fs_info->
+				   scrub_cancel_req);
+			goto out_no_enqueue;
+		}
+		sbio->bio = bio;
+		bio->bi_private = sbio;
+		bio->bi_end_io = scrub_bio_end_io;
+		bio->bi_sector = 0;
+		bio->bi_bdev = sbio->sdev->dev->bdev;
+		bio->bi_size = 0;
+		for (i = 0; i < SCRUB_PAGES_PER_BIO; ++i) {
+			struct page *page;
+			page = old_bio->bi_io_vec[i].bv_page;
+			bio_add_page(bio, page, PAGE_SIZE, 0);
+		}
+		bio_put(old_bio);
+		goto out;
+	}
+	for (i = 0; i < sbio->count; ++i) {
+		page = sbio->bio->bi_io_vec[i].bv_page;
+		buffer = kmap_atomic(page, KM_USER0);
+		flags = sbio->spag[i].flags;
+		logical = sbio->logical + i * PAGE_SIZE;
+		ret = 0;
+		if (flags & BTRFS_EXTENT_FLAG_DATA) {
+			ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
+		} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+			ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
+							logical, buffer);
+		} else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
+			BUG_ON(i);
+			(void)scrub_checksum_super(sbio, buffer);
+		} else {
+			WARN_ON(1);
+		}
+		kunmap_atomic(buffer, KM_USER0);
+		if (ret)
+			scrub_recheck_error(sbio, i);
+	}
+
+out:
+	spin_lock(&sdev->list_lock);
+	sbio->next_free = sdev->first_free;
+	sdev->first_free = sbio->index;
+	spin_unlock(&sdev->list_lock);
+out_no_enqueue:
+	atomic_dec(&sdev->in_flight);
+	wake_up(&sdev->list_wait);
+}
+
+static int scrub_checksum_data(struct scrub_dev *sdev,
+			       struct scrub_page *spag, void *buffer)
+{
+	u8 csum[BTRFS_CSUM_SIZE];
+	u32 crc = ~(u32)0;
+	int fail = 0;
+	struct btrfs_root *root = sdev->dev->dev_root;
+
+	if (!spag->have_csum)
+		return 0;
+
+	crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
+	btrfs_csum_final(crc, csum);
+	if (memcmp(csum, spag->csum, sdev->csum_size))
+		fail = 1;
+
+	spin_lock(&sdev->stat_lock);
+	++sdev->stat.data_extents_scrubbed;
+	sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
+	if (fail)
+		++sdev->stat.csum_errors;
+	spin_unlock(&sdev->stat_lock);
+
+	return fail;
+}
+
+static int scrub_checksum_tree_block(struct scrub_dev *sdev,
+				     struct scrub_page *spag, u64 logical,
+				     void *buffer)
+{
+	struct btrfs_header *h;
+	struct btrfs_root *root = sdev->dev->dev_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u8 csum[BTRFS_CSUM_SIZE];
+	u32 crc = ~(u32)0;
+	int fail = 0;
+	int crc_fail = 0;
+
+	/*
+	 * we don't use the getter functions here, as we
+	 * a) don't have an extent buffer and
+	 * b) the page is already kmapped
+	 */
+	h = (struct btrfs_header *)buffer;
+
+	if (logical != le64_to_cpu(h->bytenr))
+		++fail;
+
+	if (spag->generation != le64_to_cpu(h->generation))
+		++fail;
+
+	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+		++fail;
+
+	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+		   BTRFS_UUID_SIZE))
+		++fail;
+
+	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
+			      PAGE_SIZE - BTRFS_CSUM_SIZE);
+	btrfs_csum_final(crc, csum);
+	if (memcmp(csum, h->csum, sdev->csum_size))
+		++crc_fail;
+
+	spin_lock(&sdev->stat_lock);
+	++sdev->stat.tree_extents_scrubbed;
+	sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
+	if (crc_fail)
+		++sdev->stat.csum_errors;
+	if (fail)
+		++sdev->stat.verify_errors;
+	spin_unlock(&sdev->stat_lock);
+
+	return fail || crc_fail;
+}
+
+static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
+{
+	struct btrfs_super_block *s;
+	u64 logical;
+	struct scrub_dev *sdev = sbio->sdev;
+	struct btrfs_root *root = sdev->dev->dev_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u8 csum[BTRFS_CSUM_SIZE];
+	u32 crc = ~(u32)0;
+	int fail = 0;
+
+	s = (struct btrfs_super_block *)buffer;
+	logical = sbio->logical;
+
+	if (logical != le64_to_cpu(s->bytenr))
+		++fail;
+
+	if (sbio->spag[0].generation != le64_to_cpu(s->generation))
+		++fail;
+
+	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+		++fail;
+
+	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
+			      PAGE_SIZE - BTRFS_CSUM_SIZE);
+	btrfs_csum_final(crc, csum);
+	if (memcmp(csum, s->csum, sbio->sdev->csum_size))
+		++fail;
+
+	if (fail) {
+		/*
+		 * if we find an error in a super block, we just report it.
+		 * They will get written with the next transaction commit
+		 * anyway
+		 */
+		spin_lock(&sdev->stat_lock);
+		++sdev->stat.super_errors;
+		spin_unlock(&sdev->stat_lock);
+	}
+
+	return fail;
+}
+
+static int scrub_submit(struct scrub_dev *sdev)
+{
+	struct scrub_bio *sbio;
+
+	if (sdev->curr == -1)
+		return 0;
+
+	sbio = sdev->bios[sdev->curr];
+
+	sbio->bio->bi_sector = sbio->physical >> 9;
+	sbio->bio->bi_size = sbio->count * PAGE_SIZE;
+	sbio->bio->bi_next = NULL;
+	sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
+	sbio->bio->bi_comp_cpu = -1;
+	sbio->bio->bi_bdev = sdev->dev->bdev;
+	sbio->err = 0;
+	sdev->curr = -1;
+	atomic_inc(&sdev->in_flight);
+
+	submit_bio(0, sbio->bio);
+
+	return 0;
+}
+
+static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
+		      u64 physical, u64 flags, u64 gen, u64 mirror_num,
+		      u8 *csum, int force)
+{
+	struct scrub_bio *sbio;
+
+again:
+	/*
+	 * grab a fresh bio or wait for one to become available
+	 */
+	while (sdev->curr == -1) {
+		spin_lock(&sdev->list_lock);
+		sdev->curr = sdev->first_free;
+		if (sdev->curr != -1) {
+			sdev->first_free = sdev->bios[sdev->curr]->next_free;
+			sdev->bios[sdev->curr]->next_free = -1;
+			sdev->bios[sdev->curr]->count = 0;
+			spin_unlock(&sdev->list_lock);
+		} else {
+			spin_unlock(&sdev->list_lock);
+			wait_event(sdev->list_wait, sdev->first_free != -1);
+		}
+	}
+	sbio = sdev->bios[sdev->curr];
+	if (sbio->count == 0) {
+		sbio->physical = physical;
+		sbio->logical = logical;
+	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical) {
+		scrub_submit(sdev);
+		goto again;
+	}
+	sbio->spag[sbio->count].flags = flags;
+	sbio->spag[sbio->count].generation = gen;
+	sbio->spag[sbio->count].have_csum = 0;
+	sbio->spag[sbio->count].mirror_num = mirror_num;
+	if (csum) {
+		sbio->spag[sbio->count].have_csum = 1;
+		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
+	}
+	++sbio->count;
+	if (sbio->count == SCRUB_PAGES_PER_BIO || force)
+		scrub_submit(sdev);
+
+	return 0;
+}
+
+static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
+			   u8 *csum)
+{
+	struct btrfs_ordered_sum *sum = NULL;
+	int ret = 0;
+	unsigned long i;
+	unsigned long num_sectors;
+	u32 sectorsize = sdev->dev->dev_root->sectorsize;
+
+	while (!list_empty(&sdev->csum_list)) {
+		sum = list_first_entry(&sdev->csum_list,
+				       struct btrfs_ordered_sum, list);
+		if (sum->bytenr > logical)
+			return 0;
+		if (sum->bytenr + sum->len > logical)
+			break;
+
+		++sdev->stat.csum_discards;
+		list_del(&sum->list);
+		kfree(sum);
+		sum = NULL;
+	}
+	if (!sum)
+		return 0;
+
+	num_sectors = sum->len / sectorsize;
+	for (i = 0; i < num_sectors; ++i) {
+		if (sum->sums[i].bytenr == logical) {
+			memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
+			ret = 1;
+			break;
+		}
+	}
+	if (ret && i == num_sectors - 1) {
+		list_del(&sum->list);
+		kfree(sum);
+	}
+	return ret;
+}
+
+/* scrub extent tries to collect up to 64 kB for each bio */
+static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
+			u64 physical, u64 flags, u64 gen, u64 mirror_num)
+{
+	int ret;
+	u8 csum[BTRFS_CSUM_SIZE];
+
+	while (len) {
+		u64 l = min_t(u64, len, PAGE_SIZE);
+		int have_csum = 0;
+
+		if (flags & BTRFS_EXTENT_FLAG_DATA) {
+			/* push csums to sbio */
+			have_csum = scrub_find_csum(sdev, logical, l, csum);
+			if (have_csum == 0)
+				++sdev->stat.no_csum;
+		}
+		ret = scrub_page(sdev, logical, l, physical, flags, gen,
+				 mirror_num, have_csum ? csum : NULL, 0);
+		if (ret)
+			return ret;
+		len -= l;
+		logical += l;
+		physical += l;
+	}
+	return 0;
+}
+
+static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
+	struct map_lookup *map, int num, u64 base, u64 length)
+{
+	struct btrfs_path *path;
+	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_root *csum_root = fs_info->csum_root;
+	struct btrfs_extent_item *extent;
+	u64 flags;
+	int ret;
+	int slot;
+	int i;
+	u64 nstripes;
+	int start_stripe;
+	struct extent_buffer *l;
+	struct btrfs_key key;
+	u64 physical;
+	u64 logical;
+	u64 generation;
+	u64 mirror_num;
+
+	u64 increment = map->stripe_len;
+	u64 offset;
+
+	nstripes = length;
+	offset = 0;
+	do_div(nstripes, map->stripe_len);
+	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+		offset = map->stripe_len * num;
+		increment = map->stripe_len * map->num_stripes;
+		mirror_num = 0;
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+		int factor = map->num_stripes / map->sub_stripes;
+		offset = map->stripe_len * (num / map->sub_stripes);
+		increment = map->stripe_len * factor;
+		mirror_num = num % map->sub_stripes;
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+		increment = map->stripe_len;
+		mirror_num = num % map->num_stripes;
+	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+		increment = map->stripe_len;
+		mirror_num = num % map->num_stripes;
+	} else {
+		increment = map->stripe_len;
+		mirror_num = 0;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->reada = 2;
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+
+	/*
+	 * find all extents for each stripe and just read them to get
+	 * them into the page cache
+	 * FIXME: we can do better. build a more intelligent prefetching
+	 */
+	logical = base + offset;
+	physical = map->stripes[num].physical;
+	ret = 0;
+	for (i = 0; i < nstripes; ++i) {
+		key.objectid = logical;
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = (u64)0;
+
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out;
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(l, &key, slot);
+		if (key.objectid != logical) {
+			ret = btrfs_previous_item(root, path, 0,
+						  BTRFS_EXTENT_ITEM_KEY);
+			if (ret < 0)
+				goto out;
+		}
+
+		while (1) {
+			l = path->nodes[0];
+			slot = path->slots[0];
+			if (slot >= btrfs_header_nritems(l)) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret == 0)
+					continue;
+				if (ret < 0)
+					goto out;
+
+				break;
+			}
+			btrfs_item_key_to_cpu(l, &key, slot);
+
+			if (key.objectid >= logical + map->stripe_len)
+				break;
+
+			path->slots[0]++;
+		}
+		btrfs_release_path(root, path);
+		logical += increment;
+		physical += map->stripe_len;
+		cond_resched();
+	}
+
+	/*
+	 * collect all data csums for the stripe to avoid seeking during
+	 * the scrub. This might currently (crc32) end up to be about 1MB
+	 */
+	start_stripe = 0;
+again:
+	logical = base + offset + start_stripe * increment;
+	for (i = start_stripe; i < nstripes; ++i) {
+		ret = btrfs_lookup_csums_range(csum_root, logical,
+					       logical + map->stripe_len - 1,
+					       &sdev->csum_list, 1);
+		if (ret)
+			goto out;
+
+		logical += increment;
+		cond_resched();
+	}
+	/*
+	 * now find all extents for each stripe and scrub them
+	 */
+	logical = base + offset + start_stripe * increment;
+	physical = map->stripes[num].physical + start_stripe * map->stripe_len;
+	ret = 0;
+	for (i = start_stripe; i < nstripes; ++i) {
+		/*
+		 * canceled?
+		 */
+		if (atomic_read(&fs_info->scrub_cancel_req) ||
+		    atomic_read(&sdev->cancel_req)) {
+			ret = -ECANCELED;
+			goto out;
+		}
+		/*
+		 * check to see if we have to pause
+		 */
+		if (atomic_read(&fs_info->scrub_pause_req)) {
+			/* push queued extents */
+			scrub_submit(sdev);
+			wait_event(sdev->list_wait,
+				   atomic_read(&sdev->in_flight) == 0);
+			atomic_inc(&fs_info->scrubs_paused);
+			wake_up(&fs_info->scrub_pause_wait);
+			mutex_lock(&fs_info->scrub_lock);
+			while (atomic_read(&fs_info->scrub_pause_req)) {
+				mutex_unlock(&fs_info->scrub_lock);
+				wait_event(fs_info->scrub_pause_wait,
+				   atomic_read(&fs_info->scrub_pause_req) == 0);
+				mutex_lock(&fs_info->scrub_lock);
+			}
+			atomic_dec(&fs_info->scrubs_paused);
+			mutex_unlock(&fs_info->scrub_lock);
+			wake_up(&fs_info->scrub_pause_wait);
+			scrub_free_csums(sdev);
+			start_stripe = i;
+			goto again;
+		}
+
+		key.objectid = logical;
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = (u64)0;
+
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out;
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(l, &key, slot);
+		if (key.objectid != logical) {
+			ret = btrfs_previous_item(root, path, 0,
+						  BTRFS_EXTENT_ITEM_KEY);
+			if (ret < 0)
+				goto out;
+		}
+
+		while (1) {
+			l = path->nodes[0];
+			slot = path->slots[0];
+			if (slot >= btrfs_header_nritems(l)) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret == 0)
+					continue;
+				if (ret < 0)
+					goto out;
+
+				break;
+			}
+			btrfs_item_key_to_cpu(l, &key, slot);
+
+			if (key.objectid + key.offset <= logical)
+				goto next;
+
+			if (key.objectid >= logical + map->stripe_len)
+				break;
+
+			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
+				goto next;
+
+			extent = btrfs_item_ptr(l, slot,
+						struct btrfs_extent_item);
+			flags = btrfs_extent_flags(l, extent);
+			generation = btrfs_extent_generation(l, extent);
+
+			if (key.objectid < logical &&
+			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+				printk(KERN_ERR
+				       "btrfs scrub: tree block %llu spanning "
+				       "stripes, ignored. logical=%llu\n",
+				       (unsigned long long)key.objectid,
+				       (unsigned long long)logical);
+				goto next;
+			}
+
+			/*
+			 * trim extent to this stripe
+			 */
+			if (key.objectid < logical) {
+				key.offset -= logical - key.objectid;
+				key.objectid = logical;
+			}
+			if (key.objectid + key.offset >
+			    logical + map->stripe_len) {
+				key.offset = logical + map->stripe_len -
+					     key.objectid;
+			}
+
+			ret = scrub_extent(sdev, key.objectid, key.offset,
+					   key.objectid - logical + physical,
+					   flags, generation, mirror_num);
+			if (ret)
+				goto out;
+
+next:
+			path->slots[0]++;
+		}
+		btrfs_release_path(root, path);
+		logical += increment;
+		physical += map->stripe_len;
+		spin_lock(&sdev->stat_lock);
+		sdev->stat.last_physical = physical;
+		spin_unlock(&sdev->stat_lock);
+	}
+	/* push queued extents */
+	scrub_submit(sdev);
+
+out:
+	btrfs_free_path(path);
+	return ret < 0 ? ret : 0;
+}
+
+static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
+	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
+{
+	struct btrfs_mapping_tree *map_tree =
+		&sdev->dev->dev_root->fs_info->mapping_tree;
+	struct map_lookup *map;
+	struct extent_map *em;
+	int i;
+	int ret = -EINVAL;
+
+	read_lock(&map_tree->map_tree.lock);
+	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
+	read_unlock(&map_tree->map_tree.lock);
+
+	if (!em)
+		return -EINVAL;
+
+	map = (struct map_lookup *)em->bdev;
+	if (em->start != chunk_offset)
+		goto out;
+
+	if (em->len < length)
+		goto out;
+
+	for (i = 0; i < map->num_stripes; ++i) {
+		if (map->stripes[i].dev == sdev->dev) {
+			ret = scrub_stripe(sdev, map, i, chunk_offset, length);
+			if (ret)
+				goto out;
+		}
+	}
+out:
+	free_extent_map(em);
+
+	return ret;
+}
+
+static noinline_for_stack
+int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
+{
+	struct btrfs_dev_extent *dev_extent = NULL;
+	struct btrfs_path *path;
+	struct btrfs_root *root = sdev->dev->dev_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u64 length;
+	u64 chunk_tree;
+	u64 chunk_objectid;
+	u64 chunk_offset;
+	int ret;
+	int slot;
+	struct extent_buffer *l;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_block_group_cache *cache;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->reada = 2;
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+
+	key.objectid = sdev->dev->devid;
+	key.offset = 0ull;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out;
+		ret = 0;
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+
+		btrfs_item_key_to_cpu(l, &found_key, slot);
+
+		if (found_key.objectid != sdev->dev->devid)
+			break;
+
+		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+			break;
+
+		if (found_key.offset >= end)
+			break;
+
+		if (found_key.offset < key.offset)
+			break;
+
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		length = btrfs_dev_extent_length(l, dev_extent);
+
+		if (found_key.offset + length <= start) {
+			key.offset = found_key.offset + length;
+			btrfs_release_path(root, path);
+			continue;
+		}
+
+		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
+		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+
+		/*
+		 * get a reference on the corresponding block group to prevent
+		 * the chunk from going away while we scrub it
+		 */
+		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+		if (!cache) {
+			ret = -ENOENT;
+			goto out;
+		}
+		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
+				  chunk_offset, length);
+		btrfs_put_block_group(cache);
+		if (ret)
+			break;
+
+		key.offset = found_key.offset + length;
+		btrfs_release_path(root, path);
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
+{
+	int i;
+	u64 bytenr;
+	u64 gen;
+	int ret;
+	struct btrfs_device *device = sdev->dev;
+	struct btrfs_root *root = device->dev_root;
+
+	gen = root->fs_info->last_trans_committed;
+
+	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+		bytenr = btrfs_sb_offset(i);
+		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+			break;
+
+		ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
+				 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
+		if (ret)
+			return ret;
+	}
+	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
+
+	return 0;
+}
+
+/*
+ * get a reference count on fs_info->scrub_workers. start worker if necessary
+ */
+static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	mutex_lock(&fs_info->scrub_lock);
+	if (fs_info->scrub_workers_refcnt == 0)
+		btrfs_start_workers(&fs_info->scrub_workers, 1);
+	++fs_info->scrub_workers_refcnt;
+	mutex_unlock(&fs_info->scrub_lock);
+
+	return 0;
+}
+
+static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	mutex_lock(&fs_info->scrub_lock);
+	if (--fs_info->scrub_workers_refcnt == 0)
+		btrfs_stop_workers(&fs_info->scrub_workers);
+	WARN_ON(fs_info->scrub_workers_refcnt < 0);
+	mutex_unlock(&fs_info->scrub_lock);
+}
+
+int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
+		    struct btrfs_scrub_progress *progress)
+{
+	struct scrub_dev *sdev;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	int ret;
+	struct btrfs_device *dev;
+
+	if (root->fs_info->closing)
+		return -EINVAL;
+
+	/*
+	 * check some assumptions
+	 */
+	if (root->sectorsize != PAGE_SIZE ||
+	    root->sectorsize != root->leafsize ||
+	    root->sectorsize != root->nodesize) {
+		printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
+		return -EINVAL;
+	}
+
+	ret = scrub_workers_get(root);
+	if (ret)
+		return ret;
+
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	dev = btrfs_find_device(root, devid, NULL, NULL);
+	if (!dev || dev->missing) {
+		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+		scrub_workers_put(root);
+		return -ENODEV;
+	}
+	mutex_lock(&fs_info->scrub_lock);
+
+	if (!dev->in_fs_metadata) {
+		mutex_unlock(&fs_info->scrub_lock);
+		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+		scrub_workers_put(root);
+		return -ENODEV;
+	}
+
+	if (dev->scrub_device) {
+		mutex_unlock(&fs_info->scrub_lock);
+		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+		scrub_workers_put(root);
+		return -EINPROGRESS;
+	}
+	sdev = scrub_setup_dev(dev);
+	if (IS_ERR(sdev)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+		scrub_workers_put(root);
+		return PTR_ERR(sdev);
+	}
+	dev->scrub_device = sdev;
+
+	atomic_inc(&fs_info->scrubs_running);
+	mutex_unlock(&fs_info->scrub_lock);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+	down_read(&fs_info->scrub_super_lock);
+	ret = scrub_supers(sdev);
+	up_read(&fs_info->scrub_super_lock);
+
+	if (!ret)
+		ret = scrub_enumerate_chunks(sdev, start, end);
+
+	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
+
+	atomic_dec(&fs_info->scrubs_running);
+	wake_up(&fs_info->scrub_pause_wait);
+
+	if (progress)
+		memcpy(progress, &sdev->stat, sizeof(*progress));
+
+	mutex_lock(&fs_info->scrub_lock);
+	dev->scrub_device = NULL;
+	mutex_unlock(&fs_info->scrub_lock);
+
+	scrub_free_dev(sdev);
+	scrub_workers_put(root);
+
+	return ret;
+}
+
+int btrfs_scrub_pause(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_inc(&fs_info->scrub_pause_req);
+	while (atomic_read(&fs_info->scrubs_paused) !=
+	       atomic_read(&fs_info->scrubs_running)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		wait_event(fs_info->scrub_pause_wait,
+			   atomic_read(&fs_info->scrubs_paused) ==
+			   atomic_read(&fs_info->scrubs_running));
+		mutex_lock(&fs_info->scrub_lock);
+	}
+	mutex_unlock(&fs_info->scrub_lock);
+
+	return 0;
+}
+
+int btrfs_scrub_continue(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	atomic_dec(&fs_info->scrub_pause_req);
+	wake_up(&fs_info->scrub_pause_wait);
+	return 0;
+}
+
+int btrfs_scrub_pause_super(struct btrfs_root *root)
+{
+	down_write(&root->fs_info->scrub_super_lock);
+	return 0;
+}
+
+int btrfs_scrub_continue_super(struct btrfs_root *root)
+{
+	up_write(&root->fs_info->scrub_super_lock);
+	return 0;
+}
+
+int btrfs_scrub_cancel(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	mutex_lock(&fs_info->scrub_lock);
+	if (!atomic_read(&fs_info->scrubs_running)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		return -ENOTCONN;
+	}
+
+	atomic_inc(&fs_info->scrub_cancel_req);
+	while (atomic_read(&fs_info->scrubs_running)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		wait_event(fs_info->scrub_pause_wait,
+			   atomic_read(&fs_info->scrubs_running) == 0);
+		mutex_lock(&fs_info->scrub_lock);
+	}
+	atomic_dec(&fs_info->scrub_cancel_req);
+	mutex_unlock(&fs_info->scrub_lock);
+
+	return 0;
+}
+
+int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct scrub_dev *sdev;
+
+	mutex_lock(&fs_info->scrub_lock);
+	sdev = dev->scrub_device;
+	if (!sdev) {
+		mutex_unlock(&fs_info->scrub_lock);
+		return -ENOTCONN;
+	}
+	atomic_inc(&sdev->cancel_req);
+	while (dev->scrub_device) {
+		mutex_unlock(&fs_info->scrub_lock);
+		wait_event(fs_info->scrub_pause_wait,
+			   dev->scrub_device == NULL);
+		mutex_lock(&fs_info->scrub_lock);
+	}
+	mutex_unlock(&fs_info->scrub_lock);
+
+	return 0;
+}
+
+int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_device *dev;
+	int ret;
+
+	/*
+	 * we have to hold the device_list_mutex here so the device
+	 * does not go away in cancel_dev. FIXME: find a better solution
+	 */
+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+	dev = btrfs_find_device(root, devid, NULL, NULL);
+	if (!dev) {
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		return -ENODEV;
+	}
+	ret = btrfs_scrub_cancel_dev(root, dev);
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+	return ret;
+}
+
+int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+			 struct btrfs_scrub_progress *progress)
+{
+	struct btrfs_device *dev;
+	struct scrub_dev *sdev = NULL;
+
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	dev = btrfs_find_device(root, devid, NULL, NULL);
+	if (dev)
+		sdev = dev->scrub_device;
+	if (sdev)
+		memcpy(progress, &sdev->stat, sizeof(*progress));
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
+}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c571734d5e5a..37c2302a08d4 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1321,6 +1321,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	WARN_ON(cur_trans != trans->transaction);
 
+	btrfs_scrub_pause(root);
 	/* btrfs_commit_tree_roots is responsible for getting the
 	 * various roots consistent with each other. Every pointer
 	 * in the tree of tree roots has to point to the most up to date
@@ -1405,6 +1406,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	mutex_unlock(&root->fs_info->trans_mutex);
 
+	btrfs_scrub_continue(root);
+
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f997ec0c1ba4..f1a0726da5f5 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -614,7 +614,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_lookup_csums_range(root->log_root,
 				       csum_start, csum_end - 1,
-				       &ordered_sums);
+				       &ordered_sums, 0);
 	BUG_ON(ret);
 	while (!list_empty(&ordered_sums)) {
 		struct btrfs_ordered_sum *sums;
@@ -2093,7 +2093,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * the running transaction open, so a full commit can't hop
 	 * in and cause problems either.
 	 */
+	btrfs_scrub_pause_super(root);
 	write_ctree_super(trans, root->fs_info->tree_root, 1);
+	btrfs_scrub_continue_super(root);
 	ret = 0;
 
 	mutex_lock(&root->log_mutex);
@@ -2689,7 +2691,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 				ret = btrfs_lookup_csums_range(
 						log->fs_info->csum_root,
 						ds + cs, ds + cs + cl - 1,
-						&ordered_sums);
+						&ordered_sums, 0);
 				BUG_ON(ret);
 			}
 		}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8b9fb8c7683d..89ca8f110b6e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -38,9 +38,6 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
 				struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
 
-#define map_lookup_size(n) (sizeof(struct map_lookup) + \
-			    (sizeof(struct btrfs_bio_stripe) * (n)))
-
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
 
@@ -1334,6 +1331,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		goto error_undo;
 
 	device->in_fs_metadata = 0;
+	btrfs_scrub_cancel_dev(root, device);
 
 	/*
 	 * the device list mutex makes sure that we don't change
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index cc2eadaf7a27..f7c20123a1fe 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -85,6 +85,9 @@ struct btrfs_device {
 	/* physical drive uuid (or lvm uuid) */
 	u8 uuid[BTRFS_UUID_SIZE];
 
+	/* per-device scrub information */
+	struct scrub_dev *scrub_device;
+
 	struct btrfs_work work;
 };
 
@@ -157,6 +160,9 @@ struct map_lookup {
 	struct btrfs_bio_stripe stripes[];
 };
 
+#define map_lookup_size(n) (sizeof(struct map_lookup) + \
+			    (sizeof(struct btrfs_bio_stripe) * (n)))
+
 /* Used to sort the devices by max_avail(descending sort) */
 int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
 