diff options
author | Chris Mason <chris.mason@oracle.com> | 2008-10-01 19:05:46 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@oracle.com> | 2008-10-01 19:05:46 -0400 |
commit | 323ac95bce442bbde514e3ce57e840402f80d909 (patch) | |
tree | 9b2241ed4de746087fdfb66f83fc142279648f9b /fs | |
parent | cf749823857230017c86504bfdc70524f929ba96 (diff) |
Btrfs: don't read leaf blocks containing only checksums during truncate
Checksum items take up a significant portion of the metadata for large files.
It is possible to avoid reading them during truncates by checking the keys in
the higher level nodes.
If a given leaf is followed by another leaf where the lowest key is a checksum
item from the same file, we know we can safely delete the leaf without
reading it.
For a 32GB file on a 6 drive raid0 array, Btrfs needs 8s to delete
the file with a cold cache. It is read bound during the run.
With this change, Btrfs is able to delete the file in 0.5s
Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/ctree.c | 69 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 4 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 152 |
3 files changed, 193 insertions, 32 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ff3261ff2e19..2eab4643dcbc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c | |||
@@ -1388,7 +1388,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
1388 | struct btrfs_key prealloc_block; | 1388 | struct btrfs_key prealloc_block; |
1389 | 1389 | ||
1390 | lowest_level = p->lowest_level; | 1390 | lowest_level = p->lowest_level; |
1391 | WARN_ON(lowest_level && ins_len); | 1391 | WARN_ON(lowest_level && ins_len > 0); |
1392 | WARN_ON(p->nodes[0] != NULL); | 1392 | WARN_ON(p->nodes[0] != NULL); |
1393 | WARN_ON(cow && root == root->fs_info->extent_root && | 1393 | WARN_ON(cow && root == root->fs_info->extent_root && |
1394 | !mutex_is_locked(&root->fs_info->alloc_mutex)); | 1394 | !mutex_is_locked(&root->fs_info->alloc_mutex)); |
@@ -3187,6 +3187,36 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
3187 | } | 3187 | } |
3188 | 3188 | ||
3189 | /* | 3189 | /* |
3190 | * a helper function to delete the leaf pointed to by path->slots[1] and | ||
3191 | * path->nodes[1]. bytenr is the node block pointer, but since the callers | ||
3192 | * already know it, it is faster to have them pass it down than to | ||
3193 | * read it out of the node again. | ||
3194 | * | ||
3195 | * This deletes the pointer in path->nodes[1] and frees the leaf | ||
3196 | * block extent. zero is returned if it all worked out, < 0 otherwise. | ||
3197 | * | ||
3198 | * The path must have already been setup for deleting the leaf, including | ||
3199 | * all the proper balancing. path->nodes[1] must be locked. | ||
3200 | */ | ||
3201 | noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, | ||
3202 | struct btrfs_root *root, | ||
3203 | struct btrfs_path *path, u64 bytenr) | ||
3204 | { | ||
3205 | int ret; | ||
3206 | u64 root_gen = btrfs_header_generation(path->nodes[1]); | ||
3207 | |||
3208 | ret = del_ptr(trans, root, path, 1, path->slots[1]); | ||
3209 | if (ret) | ||
3210 | return ret; | ||
3211 | |||
3212 | ret = btrfs_free_extent(trans, root, bytenr, | ||
3213 | btrfs_level_size(root, 0), | ||
3214 | path->nodes[1]->start, | ||
3215 | btrfs_header_owner(path->nodes[1]), | ||
3216 | root_gen, 0, 0, 1); | ||
3217 | return ret; | ||
3218 | } | ||
3219 | /* | ||
3190 | * delete the item at the leaf level in path. If that empties | 3220 | * delete the item at the leaf level in path. If that empties |
3191 | * the leaf, remove it from the tree | 3221 | * the leaf, remove it from the tree |
3192 | */ | 3222 | */ |
@@ -3251,17 +3281,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
3251 | if (leaf == root->node) { | 3281 | if (leaf == root->node) { |
3252 | btrfs_set_header_level(leaf, 0); | 3282 | btrfs_set_header_level(leaf, 0); |
3253 | } else { | 3283 | } else { |
3254 | u64 root_gen = btrfs_header_generation(path->nodes[1]); | 3284 | ret = btrfs_del_leaf(trans, root, path, leaf->start); |
3255 | wret = del_ptr(trans, root, path, 1, path->slots[1]); | 3285 | BUG_ON(ret); |
3256 | if (wret) | ||
3257 | ret = wret; | ||
3258 | wret = btrfs_free_extent(trans, root, | ||
3259 | leaf->start, leaf->len, | ||
3260 | path->nodes[1]->start, | ||
3261 | btrfs_header_owner(path->nodes[1]), | ||
3262 | root_gen, 0, 0, 1); | ||
3263 | if (wret) | ||
3264 | ret = wret; | ||
3265 | } | 3286 | } |
3266 | } else { | 3287 | } else { |
3267 | int used = leaf_space_used(leaf, 0, nritems); | 3288 | int used = leaf_space_used(leaf, 0, nritems); |
@@ -3296,24 +3317,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
3296 | } | 3317 | } |
3297 | 3318 | ||
3298 | if (btrfs_header_nritems(leaf) == 0) { | 3319 | if (btrfs_header_nritems(leaf) == 0) { |
3299 | u64 root_gen; | 3320 | path->slots[1] = slot; |
3300 | u64 bytenr = leaf->start; | 3321 | ret = btrfs_del_leaf(trans, root, path, leaf->start); |
3301 | u32 blocksize = leaf->len; | 3322 | BUG_ON(ret); |
3302 | |||
3303 | root_gen = btrfs_header_generation( | ||
3304 | path->nodes[1]); | ||
3305 | |||
3306 | wret = del_ptr(trans, root, path, 1, slot); | ||
3307 | if (wret) | ||
3308 | ret = wret; | ||
3309 | |||
3310 | free_extent_buffer(leaf); | 3323 | free_extent_buffer(leaf); |
3311 | wret = btrfs_free_extent(trans, root, bytenr, | ||
3312 | blocksize, path->nodes[1]->start, | ||
3313 | btrfs_header_owner(path->nodes[1]), | ||
3314 | root_gen, 0, 0, 1); | ||
3315 | if (wret) | ||
3316 | ret = wret; | ||
3317 | } else { | 3324 | } else { |
3318 | /* if we're still in the path, make sure | 3325 | /* if we're still in the path, make sure |
3319 | * we're dirty. Otherwise, one of the | 3326 | * we're dirty. Otherwise, one of the |
@@ -3418,8 +3425,8 @@ again: | |||
3418 | level = btrfs_header_level(cur); | 3425 | level = btrfs_header_level(cur); |
3419 | sret = bin_search(cur, min_key, level, &slot); | 3426 | sret = bin_search(cur, min_key, level, &slot); |
3420 | 3427 | ||
3421 | /* at level = 0, we're done, setup the path and exit */ | 3428 | /* at the lowest level, we're done, setup the path and exit */ |
3422 | if (level == 0) { | 3429 | if (level == path->lowest_level) { |
3423 | if (slot >= nritems) | 3430 | if (slot >= nritems) |
3424 | goto find_next_key; | 3431 | goto find_next_key; |
3425 | ret = 0; | 3432 | ret = 0; |
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ded1643c0273..94e0cdfddc0c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h | |||
@@ -1649,7 +1649,9 @@ void btrfs_free_path(struct btrfs_path *p); | |||
1649 | void btrfs_init_path(struct btrfs_path *p); | 1649 | void btrfs_init_path(struct btrfs_path *p); |
1650 | int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 1650 | int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
1651 | struct btrfs_path *path, int slot, int nr); | 1651 | struct btrfs_path *path, int slot, int nr); |
1652 | 1652 | int btrfs_del_leaf(struct btrfs_trans_handle *trans, | |
1653 | struct btrfs_root *root, | ||
1654 | struct btrfs_path *path, u64 bytenr); | ||
1653 | static inline int btrfs_del_item(struct btrfs_trans_handle *trans, | 1655 | static inline int btrfs_del_item(struct btrfs_trans_handle *trans, |
1654 | struct btrfs_root *root, | 1656 | struct btrfs_root *root, |
1655 | struct btrfs_path *path) | 1657 | struct btrfs_path *path) |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f3abecc2d14c..e5c9261dcbaa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -1390,6 +1390,154 @@ fail: | |||
1390 | } | 1390 | } |
1391 | 1391 | ||
1392 | /* | 1392 | /* |
1393 | * when truncating bytes in a file, it is possible to avoid reading | ||
1394 | * the leaves that contain only checksum items. This can be the | ||
1395 | * majority of the IO required to delete a large file, but it must | ||
1396 | * be done carefully. | ||
1397 | * | ||
1398 | * The keys in the level just above the leaves are checked to make sure | ||
1399 | * the lowest key in a given leaf is a csum key, and starts at an offset | ||
1400 | * after the new size. | ||
1401 | * | ||
1402 | * Then the key for the next leaf is checked to make sure it also has | ||
1403 | * a checksum item for the same file. If it does, we know our target leaf | ||
1404 | * contains only checksum items, and it can be safely freed without reading | ||
1405 | * it. | ||
1406 | * | ||
1407 | * This is just an optimization targeted at large files. It may do | ||
1408 | * nothing. It will return 0 unless things went badly. | ||
1409 | */ | ||
1410 | static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans, | ||
1411 | struct btrfs_root *root, | ||
1412 | struct btrfs_path *path, | ||
1413 | struct inode *inode, u64 new_size) | ||
1414 | { | ||
1415 | struct btrfs_key key; | ||
1416 | int ret; | ||
1417 | int nritems; | ||
1418 | struct btrfs_key found_key; | ||
1419 | struct btrfs_key other_key; | ||
1420 | |||
1421 | path->lowest_level = 1; | ||
1422 | key.objectid = inode->i_ino; | ||
1423 | key.type = BTRFS_CSUM_ITEM_KEY; | ||
1424 | key.offset = new_size; | ||
1425 | again: | ||
1426 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
1427 | if (ret < 0) | ||
1428 | goto out; | ||
1429 | |||
1430 | if (path->nodes[1] == NULL) { | ||
1431 | ret = 0; | ||
1432 | goto out; | ||
1433 | } | ||
1434 | ret = 0; | ||
1435 | btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]); | ||
1436 | nritems = btrfs_header_nritems(path->nodes[1]); | ||
1437 | |||
1438 | if (!nritems) | ||
1439 | goto out; | ||
1440 | |||
1441 | if (path->slots[1] >= nritems) | ||
1442 | goto next_node; | ||
1443 | |||
1444 | /* did we find a key greater than anything we want to delete? */ | ||
1445 | if (found_key.objectid > inode->i_ino || | ||
1446 | (found_key.objectid == inode->i_ino && found_key.type > key.type)) | ||
1447 | goto out; | ||
1448 | |||
1449 | /* we check the next key in the node to make sure the leave contains | ||
1450 | * only checksum items. This comparison doesn't work if our | ||
1451 | * leaf is the last one in the node | ||
1452 | */ | ||
1453 | if (path->slots[1] + 1 >= nritems) { | ||
1454 | next_node: | ||
1455 | /* search forward from the last key in the node, this | ||
1456 | * will bring us into the next node in the tree | ||
1457 | */ | ||
1458 | btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1); | ||
1459 | |||
1460 | /* unlikely, but we inc below, so check to be safe */ | ||
1461 | if (found_key.offset == (u64)-1) | ||
1462 | goto out; | ||
1463 | |||
1464 | /* search_forward needs a path with locks held, do the | ||
1465 | * search again for the original key. It is possible | ||
1466 | * this will race with a balance and return a path that | ||
1467 | * we could modify, but this drop is just an optimization | ||
1468 | * and is allowed to miss some leaves. | ||
1469 | */ | ||
1470 | btrfs_release_path(root, path); | ||
1471 | found_key.offset++; | ||
1472 | |||
1473 | /* setup a max key for search_forward */ | ||
1474 | other_key.offset = (u64)-1; | ||
1475 | other_key.type = key.type; | ||
1476 | other_key.objectid = key.objectid; | ||
1477 | |||
1478 | path->keep_locks = 1; | ||
1479 | ret = btrfs_search_forward(root, &found_key, &other_key, | ||
1480 | path, 0, 0); | ||
1481 | path->keep_locks = 0; | ||
1482 | if (ret || found_key.objectid != key.objectid || | ||
1483 | found_key.type != key.type) { | ||
1484 | ret = 0; | ||
1485 | goto out; | ||
1486 | } | ||
1487 | |||
1488 | key.offset = found_key.offset; | ||
1489 | btrfs_release_path(root, path); | ||
1490 | cond_resched(); | ||
1491 | goto again; | ||
1492 | } | ||
1493 | |||
1494 | /* we know there's one more slot after us in the tree, | ||
1495 | * read that key so we can verify it is also a checksum item | ||
1496 | */ | ||
1497 | btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1); | ||
1498 | |||
1499 | if (found_key.objectid < inode->i_ino) | ||
1500 | goto next_key; | ||
1501 | |||
1502 | if (found_key.type != key.type || found_key.offset < new_size) | ||
1503 | goto next_key; | ||
1504 | |||
1505 | /* | ||
1506 | * if the key for the next leaf isn't a csum key from this objectid, | ||
1507 | * we can't be sure there aren't good items inside this leaf. | ||
1508 | * Bail out | ||
1509 | */ | ||
1510 | if (other_key.objectid != inode->i_ino || other_key.type != key.type) | ||
1511 | goto out; | ||
1512 | |||
1513 | /* | ||
1514 | * it is safe to delete this leaf, it contains only | ||
1515 | * csum items from this inode at an offset >= new_size | ||
1516 | */ | ||
1517 | ret = btrfs_del_leaf(trans, root, path, | ||
1518 | btrfs_node_blockptr(path->nodes[1], | ||
1519 | path->slots[1])); | ||
1520 | BUG_ON(ret); | ||
1521 | |||
1522 | next_key: | ||
1523 | btrfs_release_path(root, path); | ||
1524 | |||
1525 | if (other_key.objectid == inode->i_ino && | ||
1526 | other_key.type == key.type && other_key.offset > key.offset) { | ||
1527 | key.offset = other_key.offset; | ||
1528 | cond_resched(); | ||
1529 | goto again; | ||
1530 | } | ||
1531 | ret = 0; | ||
1532 | out: | ||
1533 | /* fixup any changes we've made to the path */ | ||
1534 | path->lowest_level = 0; | ||
1535 | path->keep_locks = 0; | ||
1536 | btrfs_release_path(root, path); | ||
1537 | return ret; | ||
1538 | } | ||
1539 | |||
1540 | /* | ||
1393 | * this can truncate away extent items, csum items and directory items. | 1541 | * this can truncate away extent items, csum items and directory items. |
1394 | * It starts at a high offset and removes keys until it can't find | 1542 | * It starts at a high offset and removes keys until it can't find |
1395 | * any higher than new_size | 1543 | * any higher than new_size |
@@ -1436,6 +1584,10 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | |||
1436 | key.type = (u8)-1; | 1584 | key.type = (u8)-1; |
1437 | 1585 | ||
1438 | btrfs_init_path(path); | 1586 | btrfs_init_path(path); |
1587 | |||
1588 | ret = drop_csum_leaves(trans, root, path, inode, new_size); | ||
1589 | BUG_ON(ret); | ||
1590 | |||
1439 | search_again: | 1591 | search_again: |
1440 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | 1592 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); |
1441 | if (ret < 0) { | 1593 | if (ret < 0) { |