aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2008-10-01 19:05:46 -0400
committerChris Mason <chris.mason@oracle.com>2008-10-01 19:05:46 -0400
commit323ac95bce442bbde514e3ce57e840402f80d909 (patch)
tree9b2241ed4de746087fdfb66f83fc142279648f9b /fs/btrfs/inode.c
parentcf749823857230017c86504bfdc70524f929ba96 (diff)
Btrfs: don't read leaf blocks containing only checksums during truncate
Checksum items take up a significant portion of the metadata for large files. It is possible to avoid reading them during truncates by checking the keys in the higher level nodes. If a given leaf is followed by another leaf where the lowest key is a checksum item from the same file, we know we can safely delete the leaf without reading it. For a 32GB file on a 6 drive raid0 array, Btrfs needs 8s to delete the file with a cold cache. It is read bound during the run. With this change, Btrfs is able to delete the file in 0.5s Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c152
1 files changed, 152 insertions, 0 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f3abecc2d14c..e5c9261dcbaa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1390,6 +1390,154 @@ fail:
1390} 1390}
1391 1391
1392/* 1392/*
1393 * when truncating bytes in a file, it is possible to avoid reading
1394 * the leaves that contain only checksum items. This can be the
1395 * majority of the IO required to delete a large file, but it must
1396 * be done carefully.
1397 *
1398 * The keys in the level just above the leaves are checked to make sure
1399 * the lowest key in a given leaf is a csum key, and starts at an offset
1400 * after the new size.
1401 *
1402 * Then the key for the next leaf is checked to make sure it also has
1403 * a checksum item for the same file. If it does, we know our target leaf
1404 * contains only checksum items, and it can be safely freed without reading
1405 * it.
1406 *
1407 * This is just an optimization targeted at large files. It may do
1408 * nothing. It will return 0 unless things went badly.
1409 */
1410static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
1411 struct btrfs_root *root,
1412 struct btrfs_path *path,
1413 struct inode *inode, u64 new_size)
1414{
1415 struct btrfs_key key;
1416 int ret;
1417 int nritems;
1418 struct btrfs_key found_key;
1419 struct btrfs_key other_key;
1420
1421 path->lowest_level = 1;
1422 key.objectid = inode->i_ino;
1423 key.type = BTRFS_CSUM_ITEM_KEY;
1424 key.offset = new_size;
1425again:
1426 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1427 if (ret < 0)
1428 goto out;
1429
1430 if (path->nodes[1] == NULL) {
1431 ret = 0;
1432 goto out;
1433 }
1434 ret = 0;
1435 btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
1436 nritems = btrfs_header_nritems(path->nodes[1]);
1437
1438 if (!nritems)
1439 goto out;
1440
1441 if (path->slots[1] >= nritems)
1442 goto next_node;
1443
1444 /* did we find a key greater than anything we want to delete? */
1445 if (found_key.objectid > inode->i_ino ||
1446 (found_key.objectid == inode->i_ino && found_key.type > key.type))
1447 goto out;
1448
1449 /* we check the next key in the node to make sure the leave contains
1450 * only checksum items. This comparison doesn't work if our
1451 * leaf is the last one in the node
1452 */
1453 if (path->slots[1] + 1 >= nritems) {
1454next_node:
1455 /* search forward from the last key in the node, this
1456 * will bring us into the next node in the tree
1457 */
1458 btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
1459
1460 /* unlikely, but we inc below, so check to be safe */
1461 if (found_key.offset == (u64)-1)
1462 goto out;
1463
1464 /* search_forward needs a path with locks held, do the
1465 * search again for the original key. It is possible
1466 * this will race with a balance and return a path that
1467 * we could modify, but this drop is just an optimization
1468 * and is allowed to miss some leaves.
1469 */
1470 btrfs_release_path(root, path);
1471 found_key.offset++;
1472
1473 /* setup a max key for search_forward */
1474 other_key.offset = (u64)-1;
1475 other_key.type = key.type;
1476 other_key.objectid = key.objectid;
1477
1478 path->keep_locks = 1;
1479 ret = btrfs_search_forward(root, &found_key, &other_key,
1480 path, 0, 0);
1481 path->keep_locks = 0;
1482 if (ret || found_key.objectid != key.objectid ||
1483 found_key.type != key.type) {
1484 ret = 0;
1485 goto out;
1486 }
1487
1488 key.offset = found_key.offset;
1489 btrfs_release_path(root, path);
1490 cond_resched();
1491 goto again;
1492 }
1493
1494 /* we know there's one more slot after us in the tree,
1495 * read that key so we can verify it is also a checksum item
1496 */
1497 btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
1498
1499 if (found_key.objectid < inode->i_ino)
1500 goto next_key;
1501
1502 if (found_key.type != key.type || found_key.offset < new_size)
1503 goto next_key;
1504
1505 /*
1506 * if the key for the next leaf isn't a csum key from this objectid,
1507 * we can't be sure there aren't good items inside this leaf.
1508 * Bail out
1509 */
1510 if (other_key.objectid != inode->i_ino || other_key.type != key.type)
1511 goto out;
1512
1513 /*
1514 * it is safe to delete this leaf, it contains only
1515 * csum items from this inode at an offset >= new_size
1516 */
1517 ret = btrfs_del_leaf(trans, root, path,
1518 btrfs_node_blockptr(path->nodes[1],
1519 path->slots[1]));
1520 BUG_ON(ret);
1521
1522next_key:
1523 btrfs_release_path(root, path);
1524
1525 if (other_key.objectid == inode->i_ino &&
1526 other_key.type == key.type && other_key.offset > key.offset) {
1527 key.offset = other_key.offset;
1528 cond_resched();
1529 goto again;
1530 }
1531 ret = 0;
1532out:
1533 /* fixup any changes we've made to the path */
1534 path->lowest_level = 0;
1535 path->keep_locks = 0;
1536 btrfs_release_path(root, path);
1537 return ret;
1538}
1539
1540/*
1393 * this can truncate away extent items, csum items and directory items. 1541 * this can truncate away extent items, csum items and directory items.
1394 * It starts at a high offset and removes keys until it can't find 1542 * It starts at a high offset and removes keys until it can't find
1395 * any higher than new_size 1543 * any higher than new_size
@@ -1436,6 +1584,10 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
1436 key.type = (u8)-1; 1584 key.type = (u8)-1;
1437 1585
1438 btrfs_init_path(path); 1586 btrfs_init_path(path);
1587
1588 ret = drop_csum_leaves(trans, root, path, inode, new_size);
1589 BUG_ON(ret);
1590
1439search_again: 1591search_again:
1440 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1592 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1441 if (ret < 0) { 1593 if (ret < 0) {