aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2008-10-01 19:05:46 -0400
committerChris Mason <chris.mason@oracle.com>2008-10-01 19:05:46 -0400
commit323ac95bce442bbde514e3ce57e840402f80d909 (patch)
tree9b2241ed4de746087fdfb66f83fc142279648f9b
parentcf749823857230017c86504bfdc70524f929ba96 (diff)
Btrfs: don't read leaf blocks containing only checksums during truncate
Checksum items take up a significant portion of the metadata for large files. It is possible to avoid reading them during truncates by checking the keys in the higher level nodes. If a given leaf is followed by another leaf where the lowest key is a checksum item from the same file, we know we can safely delete the leaf without reading it. For a 32GB file on a 6 drive raid0 array, Btrfs needs 8s to delete the file with a cold cache. It is read bound during the run. With this change, Btrfs is able to delete the file in 0.5s Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r--fs/btrfs/ctree.c69
-rw-r--r--fs/btrfs/ctree.h4
-rw-r--r--fs/btrfs/inode.c152
3 files changed, 193 insertions, 32 deletions
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ff3261ff2e19..2eab4643dcbc 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1388,7 +1388,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1388 struct btrfs_key prealloc_block; 1388 struct btrfs_key prealloc_block;
1389 1389
1390 lowest_level = p->lowest_level; 1390 lowest_level = p->lowest_level;
1391 WARN_ON(lowest_level && ins_len); 1391 WARN_ON(lowest_level && ins_len > 0);
1392 WARN_ON(p->nodes[0] != NULL); 1392 WARN_ON(p->nodes[0] != NULL);
1393 WARN_ON(cow && root == root->fs_info->extent_root && 1393 WARN_ON(cow && root == root->fs_info->extent_root &&
1394 !mutex_is_locked(&root->fs_info->alloc_mutex)); 1394 !mutex_is_locked(&root->fs_info->alloc_mutex));
@@ -3187,6 +3187,36 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3187} 3187}
3188 3188
3189/* 3189/*
3190 * a helper function to delete the leaf pointed to by path->slots[1] and
3191 * path->nodes[1]. bytenr is the node block pointer, but since the callers
3192 * already know it, it is faster to have them pass it down than to
3193 * read it out of the node again.
3194 *
3195 * This deletes the pointer in path->nodes[1] and frees the leaf
3196 * block extent. zero is returned if it all worked out, < 0 otherwise.
3197 *
3198 * The path must have already been setup for deleting the leaf, including
3199 * all the proper balancing. path->nodes[1] must be locked.
3200 */
3201noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3202 struct btrfs_root *root,
3203 struct btrfs_path *path, u64 bytenr)
3204{
3205 int ret;
3206 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3207
3208 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3209 if (ret)
3210 return ret;
3211
3212 ret = btrfs_free_extent(trans, root, bytenr,
3213 btrfs_level_size(root, 0),
3214 path->nodes[1]->start,
3215 btrfs_header_owner(path->nodes[1]),
3216 root_gen, 0, 0, 1);
3217 return ret;
3218}
3219/*
3190 * delete the item at the leaf level in path. If that empties 3220 * delete the item at the leaf level in path. If that empties
3191 * the leaf, remove it from the tree 3221 * the leaf, remove it from the tree
3192 */ 3222 */
@@ -3251,17 +3281,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3251 if (leaf == root->node) { 3281 if (leaf == root->node) {
3252 btrfs_set_header_level(leaf, 0); 3282 btrfs_set_header_level(leaf, 0);
3253 } else { 3283 } else {
3254 u64 root_gen = btrfs_header_generation(path->nodes[1]); 3284 ret = btrfs_del_leaf(trans, root, path, leaf->start);
3255 wret = del_ptr(trans, root, path, 1, path->slots[1]); 3285 BUG_ON(ret);
3256 if (wret)
3257 ret = wret;
3258 wret = btrfs_free_extent(trans, root,
3259 leaf->start, leaf->len,
3260 path->nodes[1]->start,
3261 btrfs_header_owner(path->nodes[1]),
3262 root_gen, 0, 0, 1);
3263 if (wret)
3264 ret = wret;
3265 } 3286 }
3266 } else { 3287 } else {
3267 int used = leaf_space_used(leaf, 0, nritems); 3288 int used = leaf_space_used(leaf, 0, nritems);
@@ -3296,24 +3317,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3296 } 3317 }
3297 3318
3298 if (btrfs_header_nritems(leaf) == 0) { 3319 if (btrfs_header_nritems(leaf) == 0) {
3299 u64 root_gen; 3320 path->slots[1] = slot;
3300 u64 bytenr = leaf->start; 3321 ret = btrfs_del_leaf(trans, root, path, leaf->start);
3301 u32 blocksize = leaf->len; 3322 BUG_ON(ret);
3302
3303 root_gen = btrfs_header_generation(
3304 path->nodes[1]);
3305
3306 wret = del_ptr(trans, root, path, 1, slot);
3307 if (wret)
3308 ret = wret;
3309
3310 free_extent_buffer(leaf); 3323 free_extent_buffer(leaf);
3311 wret = btrfs_free_extent(trans, root, bytenr,
3312 blocksize, path->nodes[1]->start,
3313 btrfs_header_owner(path->nodes[1]),
3314 root_gen, 0, 0, 1);
3315 if (wret)
3316 ret = wret;
3317 } else { 3324 } else {
3318 /* if we're still in the path, make sure 3325 /* if we're still in the path, make sure
3319 * we're dirty. Otherwise, one of the 3326 * we're dirty. Otherwise, one of the
@@ -3418,8 +3425,8 @@ again:
3418 level = btrfs_header_level(cur); 3425 level = btrfs_header_level(cur);
3419 sret = bin_search(cur, min_key, level, &slot); 3426 sret = bin_search(cur, min_key, level, &slot);
3420 3427
3421 /* at level = 0, we're done, setup the path and exit */ 3428 /* at the lowest level, we're done, setup the path and exit */
3422 if (level == 0) { 3429 if (level == path->lowest_level) {
3423 if (slot >= nritems) 3430 if (slot >= nritems)
3424 goto find_next_key; 3431 goto find_next_key;
3425 ret = 0; 3432 ret = 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ded1643c0273..94e0cdfddc0c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1649,7 +1649,9 @@ void btrfs_free_path(struct btrfs_path *p);
1649void btrfs_init_path(struct btrfs_path *p); 1649void btrfs_init_path(struct btrfs_path *p);
1650int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 1650int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1651 struct btrfs_path *path, int slot, int nr); 1651 struct btrfs_path *path, int slot, int nr);
1652 1652int btrfs_del_leaf(struct btrfs_trans_handle *trans,
1653 struct btrfs_root *root,
1654 struct btrfs_path *path, u64 bytenr);
1653static inline int btrfs_del_item(struct btrfs_trans_handle *trans, 1655static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
1654 struct btrfs_root *root, 1656 struct btrfs_root *root,
1655 struct btrfs_path *path) 1657 struct btrfs_path *path)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f3abecc2d14c..e5c9261dcbaa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1390,6 +1390,154 @@ fail:
1390} 1390}
1391 1391
1392/* 1392/*
1393 * when truncating bytes in a file, it is possible to avoid reading
1394 * the leaves that contain only checksum items. This can be the
1395 * majority of the IO required to delete a large file, but it must
1396 * be done carefully.
1397 *
1398 * The keys in the level just above the leaves are checked to make sure
1399 * the lowest key in a given leaf is a csum key, and starts at an offset
1400 * after the new size.
1401 *
1402 * Then the key for the next leaf is checked to make sure it also has
1403 * a checksum item for the same file. If it does, we know our target leaf
1404 * contains only checksum items, and it can be safely freed without reading
1405 * it.
1406 *
1407 * This is just an optimization targeted at large files. It may do
1408 * nothing. It will return 0 unless things went badly.
1409 */
1410static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
1411 struct btrfs_root *root,
1412 struct btrfs_path *path,
1413 struct inode *inode, u64 new_size)
1414{
1415 struct btrfs_key key;
1416 int ret;
1417 int nritems;
1418 struct btrfs_key found_key;
1419 struct btrfs_key other_key;
1420
1421 path->lowest_level = 1;
1422 key.objectid = inode->i_ino;
1423 key.type = BTRFS_CSUM_ITEM_KEY;
1424 key.offset = new_size;
1425again:
1426 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1427 if (ret < 0)
1428 goto out;
1429
1430 if (path->nodes[1] == NULL) {
1431 ret = 0;
1432 goto out;
1433 }
1434 ret = 0;
1435 btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
1436 nritems = btrfs_header_nritems(path->nodes[1]);
1437
1438 if (!nritems)
1439 goto out;
1440
1441 if (path->slots[1] >= nritems)
1442 goto next_node;
1443
1444 /* did we find a key greater than anything we want to delete? */
1445 if (found_key.objectid > inode->i_ino ||
1446 (found_key.objectid == inode->i_ino && found_key.type > key.type))
1447 goto out;
1448
1449 /* we check the next key in the node to make sure the leave contains
1450 * only checksum items. This comparison doesn't work if our
1451 * leaf is the last one in the node
1452 */
1453 if (path->slots[1] + 1 >= nritems) {
1454next_node:
1455 /* search forward from the last key in the node, this
1456 * will bring us into the next node in the tree
1457 */
1458 btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
1459
1460 /* unlikely, but we inc below, so check to be safe */
1461 if (found_key.offset == (u64)-1)
1462 goto out;
1463
1464 /* search_forward needs a path with locks held, do the
1465 * search again for the original key. It is possible
1466 * this will race with a balance and return a path that
1467 * we could modify, but this drop is just an optimization
1468 * and is allowed to miss some leaves.
1469 */
1470 btrfs_release_path(root, path);
1471 found_key.offset++;
1472
1473 /* setup a max key for search_forward */
1474 other_key.offset = (u64)-1;
1475 other_key.type = key.type;
1476 other_key.objectid = key.objectid;
1477
1478 path->keep_locks = 1;
1479 ret = btrfs_search_forward(root, &found_key, &other_key,
1480 path, 0, 0);
1481 path->keep_locks = 0;
1482 if (ret || found_key.objectid != key.objectid ||
1483 found_key.type != key.type) {
1484 ret = 0;
1485 goto out;
1486 }
1487
1488 key.offset = found_key.offset;
1489 btrfs_release_path(root, path);
1490 cond_resched();
1491 goto again;
1492 }
1493
1494 /* we know there's one more slot after us in the tree,
1495 * read that key so we can verify it is also a checksum item
1496 */
1497 btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
1498
1499 if (found_key.objectid < inode->i_ino)
1500 goto next_key;
1501
1502 if (found_key.type != key.type || found_key.offset < new_size)
1503 goto next_key;
1504
1505 /*
1506 * if the key for the next leaf isn't a csum key from this objectid,
1507 * we can't be sure there aren't good items inside this leaf.
1508 * Bail out
1509 */
1510 if (other_key.objectid != inode->i_ino || other_key.type != key.type)
1511 goto out;
1512
1513 /*
1514 * it is safe to delete this leaf, it contains only
1515 * csum items from this inode at an offset >= new_size
1516 */
1517 ret = btrfs_del_leaf(trans, root, path,
1518 btrfs_node_blockptr(path->nodes[1],
1519 path->slots[1]));
1520 BUG_ON(ret);
1521
1522next_key:
1523 btrfs_release_path(root, path);
1524
1525 if (other_key.objectid == inode->i_ino &&
1526 other_key.type == key.type && other_key.offset > key.offset) {
1527 key.offset = other_key.offset;
1528 cond_resched();
1529 goto again;
1530 }
1531 ret = 0;
1532out:
1533 /* fixup any changes we've made to the path */
1534 path->lowest_level = 0;
1535 path->keep_locks = 0;
1536 btrfs_release_path(root, path);
1537 return ret;
1538}
1539
1540/*
1393 * this can truncate away extent items, csum items and directory items. 1541 * this can truncate away extent items, csum items and directory items.
1394 * It starts at a high offset and removes keys until it can't find 1542 * It starts at a high offset and removes keys until it can't find
1395 * any higher than new_size 1543 * any higher than new_size
@@ -1436,6 +1584,10 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
1436 key.type = (u8)-1; 1584 key.type = (u8)-1;
1437 1585
1438 btrfs_init_path(path); 1586 btrfs_init_path(path);
1587
1588 ret = drop_csum_leaves(trans, root, path, inode, new_size);
1589 BUG_ON(ret);
1590
1439search_again: 1591search_again:
1440 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1592 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1441 if (ret < 0) { 1593 if (ret < 0) {