aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/file-item.c
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2008-12-08 16:58:54 -0500
committerChris Mason <chris.mason@oracle.com>2008-12-08 16:58:54 -0500
commitd20f7043fa65659136c1a7c3c456eeeb5c6f431f (patch)
tree05d1031cadec6d440a97221e3a32adb504a51699 /fs/btrfs/file-item.c
parentc99e905c945c462085c6d64646dc5af0c0a16815 (diff)
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have been stored in the subvolume trees, indexed by the inode that is referencing the data block. This means that when we read the inode, we've probably read in at least some checksums as well. But, this has a few problems: * The checksums are indexed by logical offset in the file. When compression is on, this means we have to do the expensive checksumming on the uncompressed data. It would be faster if we could checksum the compressed data instead. * If we implement encryption, we'll be checksumming the plain text and storing that on disk. This is significantly less secure. * For either compression or encryption, we have to get the plain text back before we can verify the checksum as correct. This makes the raid layer balancing and extent moving much more expensive. * It makes the front end caching code more complex, as we have to touch the subvolume and inodes as we cache extents. * There is potentially one copy of the checksum in each subvolume referencing an extent. The solution used here is to store the extent checksums in a dedicated tree. This allows us to index the checksums by physical extent start and length. It means: * The checksum is against the data stored on disk, after any compression or encryption is done. * The checksum is stored in a central location, and can be verified without following back references, or reading inodes. This makes compression significantly faster by reducing the amount of data that needs to be checksummed. It will also allow much faster raid management code in general. The checksums are indexed by a key with a fixed objectid (a magic value in ctree.h) and offset set to the starting byte of the extent. This allows us to copy the checksum items into the fsync log tree directly (or any other tree), without having to invent a second format for them. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/file-item.c')
-rw-r--r--fs/btrfs/file-item.c185
1 files changed, 69 insertions, 116 deletions
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 234ed441736c..a3ad2ce00116 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -74,8 +74,7 @@ out:
74struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, 74struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
75 struct btrfs_root *root, 75 struct btrfs_root *root,
76 struct btrfs_path *path, 76 struct btrfs_path *path,
77 u64 objectid, u64 offset, 77 u64 bytenr, int cow)
78 int cow)
79{ 78{
80 int ret; 79 int ret;
81 struct btrfs_key file_key; 80 struct btrfs_key file_key;
@@ -87,9 +86,9 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
87 btrfs_super_csum_size(&root->fs_info->super_copy); 86 btrfs_super_csum_size(&root->fs_info->super_copy);
88 int csums_in_item; 87 int csums_in_item;
89 88
90 file_key.objectid = objectid; 89 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
91 file_key.offset = offset; 90 file_key.offset = bytenr;
92 btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY); 91 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
93 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); 92 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
94 if (ret < 0) 93 if (ret < 0)
95 goto fail; 94 goto fail;
@@ -100,11 +99,10 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
100 goto fail; 99 goto fail;
101 path->slots[0]--; 100 path->slots[0]--;
102 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 101 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
103 if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY || 102 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
104 found_key.objectid != objectid) {
105 goto fail; 103 goto fail;
106 } 104
107 csum_offset = (offset - found_key.offset) >> 105 csum_offset = (bytenr - found_key.offset) >>
108 root->fs_info->sb->s_blocksize_bits; 106 root->fs_info->sb->s_blocksize_bits;
109 csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); 107 csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
110 csums_in_item /= csum_size; 108 csums_in_item /= csum_size;
@@ -143,7 +141,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
143} 141}
144 142
145int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 143int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
146 struct bio *bio) 144 struct bio *bio, u32 *dst)
147{ 145{
148 u32 sum; 146 u32 sum;
149 struct bio_vec *bvec = bio->bi_io_vec; 147 struct bio_vec *bvec = bio->bi_io_vec;
@@ -151,6 +149,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
151 u64 offset; 149 u64 offset;
152 u64 item_start_offset = 0; 150 u64 item_start_offset = 0;
153 u64 item_last_offset = 0; 151 u64 item_last_offset = 0;
152 u64 disk_bytenr;
154 u32 diff; 153 u32 diff;
155 u16 csum_size = 154 u16 csum_size =
156 btrfs_super_csum_size(&root->fs_info->super_copy); 155 btrfs_super_csum_size(&root->fs_info->super_copy);
@@ -165,21 +164,22 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
165 164
166 WARN_ON(bio->bi_vcnt <= 0); 165 WARN_ON(bio->bi_vcnt <= 0);
167 166
167 disk_bytenr = (u64)bio->bi_sector << 9;
168 while(bio_index < bio->bi_vcnt) { 168 while(bio_index < bio->bi_vcnt) {
169 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 169 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
170 ret = btrfs_find_ordered_sum(inode, offset, &sum); 170 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
171 if (ret == 0) 171 if (ret == 0)
172 goto found; 172 goto found;
173 173
174 if (!item || offset < item_start_offset || 174 if (!item || disk_bytenr < item_start_offset ||
175 offset >= item_last_offset) { 175 disk_bytenr >= item_last_offset) {
176 struct btrfs_key found_key; 176 struct btrfs_key found_key;
177 u32 item_size; 177 u32 item_size;
178 178
179 if (item) 179 if (item)
180 btrfs_release_path(root, path); 180 btrfs_release_path(root, path);
181 item = btrfs_lookup_csum(NULL, root, path, 181 item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
182 inode->i_ino, offset, 0); 182 path, disk_bytenr, 0);
183 if (IS_ERR(item)) { 183 if (IS_ERR(item)) {
184 ret = PTR_ERR(item); 184 ret = PTR_ERR(item);
185 if (ret == -ENOENT || ret == -EFBIG) 185 if (ret == -ENOENT || ret == -EFBIG)
@@ -208,7 +208,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
208 * this byte range must be able to fit inside 208 * this byte range must be able to fit inside
209 * a single leaf so it will also fit inside a u32 209 * a single leaf so it will also fit inside a u32
210 */ 210 */
211 diff = offset - item_start_offset; 211 diff = disk_bytenr - item_start_offset;
212 diff = diff / root->sectorsize; 212 diff = diff / root->sectorsize;
213 diff = diff * csum_size; 213 diff = diff * csum_size;
214 214
@@ -216,7 +216,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
216 ((unsigned long)item) + diff, 216 ((unsigned long)item) + diff,
217 csum_size); 217 csum_size);
218found: 218found:
219 set_state_private(io_tree, offset, sum); 219 if (dst)
220 *dst++ = sum;
221 else
222 set_state_private(io_tree, offset, sum);
223 disk_bytenr += bvec->bv_len;
220 bio_index++; 224 bio_index++;
221 bvec++; 225 bvec++;
222 } 226 }
@@ -224,75 +228,8 @@ found:
224 return 0; 228 return 0;
225} 229}
226 230
227int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
228 u64 start, unsigned long len)
229{
230 struct btrfs_ordered_sum *sums;
231 struct btrfs_sector_sum *sector_sum;
232 struct btrfs_ordered_extent *ordered;
233 char *data;
234 struct page *page;
235 unsigned long total_bytes = 0;
236 unsigned long this_sum_bytes = 0;
237
238 sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
239 if (!sums)
240 return -ENOMEM;
241
242 sector_sum = sums->sums;
243 sums->file_offset = start;
244 sums->len = len;
245 INIT_LIST_HEAD(&sums->list);
246 ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
247 BUG_ON(!ordered);
248
249 while(len > 0) {
250 if (start >= ordered->file_offset + ordered->len ||
251 start < ordered->file_offset) {
252 sums->len = this_sum_bytes;
253 this_sum_bytes = 0;
254 btrfs_add_ordered_sum(inode, ordered, sums);
255 btrfs_put_ordered_extent(ordered);
256
257 sums = kzalloc(btrfs_ordered_sum_size(root, len),
258 GFP_NOFS);
259 BUG_ON(!sums);
260 sector_sum = sums->sums;
261 sums->len = len;
262 sums->file_offset = start;
263 ordered = btrfs_lookup_ordered_extent(inode,
264 sums->file_offset);
265 BUG_ON(!ordered);
266 }
267
268 page = find_get_page(inode->i_mapping,
269 start >> PAGE_CACHE_SHIFT);
270
271 data = kmap_atomic(page, KM_USER0);
272 sector_sum->sum = ~(u32)0;
273 sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum,
274 PAGE_CACHE_SIZE);
275 kunmap_atomic(data, KM_USER0);
276 btrfs_csum_final(sector_sum->sum,
277 (char *)&sector_sum->sum);
278 sector_sum->offset = page_offset(page);
279 page_cache_release(page);
280
281 sector_sum++;
282 total_bytes += PAGE_CACHE_SIZE;
283 this_sum_bytes += PAGE_CACHE_SIZE;
284 start += PAGE_CACHE_SIZE;
285
286 WARN_ON(len < PAGE_CACHE_SIZE);
287 len -= PAGE_CACHE_SIZE;
288 }
289 btrfs_add_ordered_sum(inode, ordered, sums);
290 btrfs_put_ordered_extent(ordered);
291 return 0;
292}
293
294int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, 231int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
295 struct bio *bio) 232 struct bio *bio, u64 file_start, int contig)
296{ 233{
297 struct btrfs_ordered_sum *sums; 234 struct btrfs_ordered_sum *sums;
298 struct btrfs_sector_sum *sector_sum; 235 struct btrfs_sector_sum *sector_sum;
@@ -303,6 +240,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
303 unsigned long total_bytes = 0; 240 unsigned long total_bytes = 0;
304 unsigned long this_sum_bytes = 0; 241 unsigned long this_sum_bytes = 0;
305 u64 offset; 242 u64 offset;
243 u64 disk_bytenr;
306 244
307 WARN_ON(bio->bi_vcnt <= 0); 245 WARN_ON(bio->bi_vcnt <= 0);
308 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); 246 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
@@ -310,16 +248,25 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
310 return -ENOMEM; 248 return -ENOMEM;
311 249
312 sector_sum = sums->sums; 250 sector_sum = sums->sums;
313 sums->file_offset = page_offset(bvec->bv_page) + bvec->bv_offset; 251 disk_bytenr = (u64)bio->bi_sector << 9;
314 sums->len = bio->bi_size; 252 sums->len = bio->bi_size;
315 INIT_LIST_HEAD(&sums->list); 253 INIT_LIST_HEAD(&sums->list);
316 ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset); 254
255 if (contig)
256 offset = file_start;
257 else
258 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
259
260 ordered = btrfs_lookup_ordered_extent(inode, offset);
317 BUG_ON(!ordered); 261 BUG_ON(!ordered);
262 sums->bytenr = ordered->start;
318 263
319 while(bio_index < bio->bi_vcnt) { 264 while(bio_index < bio->bi_vcnt) {
320 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 265 if (!contig)
321 if (offset >= ordered->file_offset + ordered->len || 266 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
322 offset < ordered->file_offset) { 267
268 if (!contig && (offset >= ordered->file_offset + ordered->len ||
269 offset < ordered->file_offset)) {
323 unsigned long bytes_left; 270 unsigned long bytes_left;
324 sums->len = this_sum_bytes; 271 sums->len = this_sum_bytes;
325 this_sum_bytes = 0; 272 this_sum_bytes = 0;
@@ -333,10 +280,9 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
333 BUG_ON(!sums); 280 BUG_ON(!sums);
334 sector_sum = sums->sums; 281 sector_sum = sums->sums;
335 sums->len = bytes_left; 282 sums->len = bytes_left;
336 sums->file_offset = offset; 283 ordered = btrfs_lookup_ordered_extent(inode, offset);
337 ordered = btrfs_lookup_ordered_extent(inode,
338 sums->file_offset);
339 BUG_ON(!ordered); 284 BUG_ON(!ordered);
285 sums->bytenr = ordered->start;
340 } 286 }
341 287
342 data = kmap_atomic(bvec->bv_page, KM_USER0); 288 data = kmap_atomic(bvec->bv_page, KM_USER0);
@@ -348,13 +294,14 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
348 kunmap_atomic(data, KM_USER0); 294 kunmap_atomic(data, KM_USER0);
349 btrfs_csum_final(sector_sum->sum, 295 btrfs_csum_final(sector_sum->sum,
350 (char *)&sector_sum->sum); 296 (char *)&sector_sum->sum);
351 sector_sum->offset = page_offset(bvec->bv_page) + 297 sector_sum->bytenr = disk_bytenr;
352 bvec->bv_offset;
353 298
354 sector_sum++; 299 sector_sum++;
355 bio_index++; 300 bio_index++;
356 total_bytes += bvec->bv_len; 301 total_bytes += bvec->bv_len;
357 this_sum_bytes += bvec->bv_len; 302 this_sum_bytes += bvec->bv_len;
303 disk_bytenr += bvec->bv_len;
304 offset += bvec->bv_len;
358 bvec++; 305 bvec++;
359 } 306 }
360 this_sum_bytes = 0; 307 this_sum_bytes = 0;
@@ -364,11 +311,10 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
364} 311}
365 312
366int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 313int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
367 struct btrfs_root *root, struct inode *inode, 314 struct btrfs_root *root,
368 struct btrfs_ordered_sum *sums) 315 struct btrfs_ordered_sum *sums)
369{ 316{
370 u64 objectid = inode->i_ino; 317 u64 bytenr;
371 u64 offset;
372 int ret; 318 int ret;
373 struct btrfs_key file_key; 319 struct btrfs_key file_key;
374 struct btrfs_key found_key; 320 struct btrfs_key found_key;
@@ -396,13 +342,12 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
396again: 342again:
397 next_offset = (u64)-1; 343 next_offset = (u64)-1;
398 found_next = 0; 344 found_next = 0;
399 offset = sector_sum->offset; 345 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
400 file_key.objectid = objectid; 346 file_key.offset = sector_sum->bytenr;
401 file_key.offset = offset; 347 bytenr = sector_sum->bytenr;
402 btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY); 348 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
403 349
404 mutex_lock(&BTRFS_I(inode)->csum_mutex); 350 item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
405 item = btrfs_lookup_csum(trans, root, path, objectid, offset, 1);
406 if (!IS_ERR(item)) { 351 if (!IS_ERR(item)) {
407 leaf = path->nodes[0]; 352 leaf = path->nodes[0];
408 ret = 0; 353 ret = 0;
@@ -432,8 +377,8 @@ again:
432 slot = 0; 377 slot = 0;
433 } 378 }
434 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); 379 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
435 if (found_key.objectid != objectid || 380 if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
436 found_key.type != BTRFS_CSUM_ITEM_KEY) { 381 found_key.type != BTRFS_EXTENT_CSUM_KEY) {
437 found_next = 1; 382 found_next = 1;
438 goto insert; 383 goto insert;
439 } 384 }
@@ -460,10 +405,10 @@ again:
460 path->slots[0]--; 405 path->slots[0]--;
461 leaf = path->nodes[0]; 406 leaf = path->nodes[0];
462 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 407 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
463 csum_offset = (offset - found_key.offset) >> 408 csum_offset = (bytenr - found_key.offset) >>
464 root->fs_info->sb->s_blocksize_bits; 409 root->fs_info->sb->s_blocksize_bits;
465 if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY || 410 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
466 found_key.objectid != objectid || 411 found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
467 csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { 412 csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
468 goto insert; 413 goto insert;
469 } 414 }
@@ -482,8 +427,18 @@ insert:
482 btrfs_release_path(root, path); 427 btrfs_release_path(root, path);
483 csum_offset = 0; 428 csum_offset = 0;
484 if (found_next) { 429 if (found_next) {
485 u64 tmp = min((u64)i_size_read(inode), next_offset); 430 u64 tmp = total_bytes + root->sectorsize;
486 tmp -= offset & ~((u64)root->sectorsize -1); 431 u64 next_sector = sector_sum->bytenr;
432 struct btrfs_sector_sum *next = sector_sum + 1;
433
434 while(tmp < sums->len) {
435 if (next_sector + root->sectorsize != next->bytenr)
436 break;
437 tmp += root->sectorsize;
438 next_sector = next->bytenr;
439 next++;
440 }
441 tmp = min(tmp, next_offset - file_key.offset);
487 tmp >>= root->fs_info->sb->s_blocksize_bits; 442 tmp >>= root->fs_info->sb->s_blocksize_bits;
488 tmp = max((u64)1, tmp); 443 tmp = max((u64)1, tmp);
489 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); 444 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
@@ -510,7 +465,6 @@ found:
510 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + 465 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
511 btrfs_item_size_nr(leaf, path->slots[0])); 466 btrfs_item_size_nr(leaf, path->slots[0]));
512 eb_token = NULL; 467 eb_token = NULL;
513 mutex_unlock(&BTRFS_I(inode)->csum_mutex);
514 cond_resched(); 468 cond_resched();
515next_sector: 469next_sector:
516 470
@@ -541,9 +495,9 @@ next_sector:
541 if (total_bytes < sums->len) { 495 if (total_bytes < sums->len) {
542 item = (struct btrfs_csum_item *)((char *)item + 496 item = (struct btrfs_csum_item *)((char *)item +
543 csum_size); 497 csum_size);
544 if (item < item_end && offset + PAGE_CACHE_SIZE == 498 if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
545 sector_sum->offset) { 499 sector_sum->bytenr) {
546 offset = sector_sum->offset; 500 bytenr = sector_sum->bytenr;
547 goto next_sector; 501 goto next_sector;
548 } 502 }
549 } 503 }
@@ -562,7 +516,6 @@ out:
562 return ret; 516 return ret;
563 517
564fail_unlock: 518fail_unlock:
565 mutex_unlock(&BTRFS_I(inode)->csum_mutex);
566 goto out; 519 goto out;
567} 520}
568 521