diff options
author | Chris Mason <chris.mason@oracle.com> | 2008-10-01 19:05:46 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@oracle.com> | 2008-10-01 19:05:46 -0400 |
commit | 323ac95bce442bbde514e3ce57e840402f80d909 (patch) | |
tree | 9b2241ed4de746087fdfb66f83fc142279648f9b /fs/btrfs/inode.c | |
parent | cf749823857230017c86504bfdc70524f929ba96 (diff) |
Btrfs: don't read leaf blocks containing only checksums during truncate
Checksum items take up a significant portion of the metadata for large files.
It is possible to avoid reading them during truncates by checking the keys in
the higher level nodes.
If a given leaf is followed by another leaf where the lowest key is a checksum
item from the same file, we know we can safely delete the leaf without
reading it.
For a 32GB file on a 6 drive raid0 array, Btrfs needs 8s to delete
the file with a cold cache. It is read bound during the run.
With this change, Btrfs is able to delete the file in 0.5s
Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 152 |
1 files changed, 152 insertions, 0 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f3abecc2d14c..e5c9261dcbaa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -1390,6 +1390,154 @@ fail: | |||
1390 | } | 1390 | } |
1391 | 1391 | ||
1392 | /* | 1392 | /* |
1393 | * when truncating bytes in a file, it is possible to avoid reading | ||
1394 | * the leaves that contain only checksum items. This can be the | ||
1395 | * majority of the IO required to delete a large file, but it must | ||
1396 | * be done carefully. | ||
1397 | * | ||
1398 | * The keys in the level just above the leaves are checked to make sure | ||
1399 | * the lowest key in a given leaf is a csum key, and starts at an offset | ||
1400 | * after the new size. | ||
1401 | * | ||
1402 | * Then the key for the next leaf is checked to make sure it also has | ||
1403 | * a checksum item for the same file. If it does, we know our target leaf | ||
1404 | * contains only checksum items, and it can be safely freed without reading | ||
1405 | * it. | ||
1406 | * | ||
1407 | * This is just an optimization targeted at large files. It may do | ||
1408 | * nothing. It will return 0 unless things went badly. | ||
1409 | */ | ||
1410 | static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans, | ||
1411 | struct btrfs_root *root, | ||
1412 | struct btrfs_path *path, | ||
1413 | struct inode *inode, u64 new_size) | ||
1414 | { | ||
1415 | struct btrfs_key key; | ||
1416 | int ret; | ||
1417 | int nritems; | ||
1418 | struct btrfs_key found_key; | ||
1419 | struct btrfs_key other_key; | ||
1420 | |||
1421 | path->lowest_level = 1; | ||
1422 | key.objectid = inode->i_ino; | ||
1423 | key.type = BTRFS_CSUM_ITEM_KEY; | ||
1424 | key.offset = new_size; | ||
1425 | again: | ||
1426 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
1427 | if (ret < 0) | ||
1428 | goto out; | ||
1429 | |||
1430 | if (path->nodes[1] == NULL) { | ||
1431 | ret = 0; | ||
1432 | goto out; | ||
1433 | } | ||
1434 | ret = 0; | ||
1435 | btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]); | ||
1436 | nritems = btrfs_header_nritems(path->nodes[1]); | ||
1437 | |||
1438 | if (!nritems) | ||
1439 | goto out; | ||
1440 | |||
1441 | if (path->slots[1] >= nritems) | ||
1442 | goto next_node; | ||
1443 | |||
1444 | /* did we find a key greater than anything we want to delete? */ | ||
1445 | if (found_key.objectid > inode->i_ino || | ||
1446 | (found_key.objectid == inode->i_ino && found_key.type > key.type)) | ||
1447 | goto out; | ||
1448 | |||
1449 | /* we check the next key in the node to make sure the leave contains | ||
1450 | * only checksum items. This comparison doesn't work if our | ||
1451 | * leaf is the last one in the node | ||
1452 | */ | ||
1453 | if (path->slots[1] + 1 >= nritems) { | ||
1454 | next_node: | ||
1455 | /* search forward from the last key in the node, this | ||
1456 | * will bring us into the next node in the tree | ||
1457 | */ | ||
1458 | btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1); | ||
1459 | |||
1460 | /* unlikely, but we inc below, so check to be safe */ | ||
1461 | if (found_key.offset == (u64)-1) | ||
1462 | goto out; | ||
1463 | |||
1464 | /* search_forward needs a path with locks held, do the | ||
1465 | * search again for the original key. It is possible | ||
1466 | * this will race with a balance and return a path that | ||
1467 | * we could modify, but this drop is just an optimization | ||
1468 | * and is allowed to miss some leaves. | ||
1469 | */ | ||
1470 | btrfs_release_path(root, path); | ||
1471 | found_key.offset++; | ||
1472 | |||
1473 | /* setup a max key for search_forward */ | ||
1474 | other_key.offset = (u64)-1; | ||
1475 | other_key.type = key.type; | ||
1476 | other_key.objectid = key.objectid; | ||
1477 | |||
1478 | path->keep_locks = 1; | ||
1479 | ret = btrfs_search_forward(root, &found_key, &other_key, | ||
1480 | path, 0, 0); | ||
1481 | path->keep_locks = 0; | ||
1482 | if (ret || found_key.objectid != key.objectid || | ||
1483 | found_key.type != key.type) { | ||
1484 | ret = 0; | ||
1485 | goto out; | ||
1486 | } | ||
1487 | |||
1488 | key.offset = found_key.offset; | ||
1489 | btrfs_release_path(root, path); | ||
1490 | cond_resched(); | ||
1491 | goto again; | ||
1492 | } | ||
1493 | |||
1494 | /* we know there's one more slot after us in the tree, | ||
1495 | * read that key so we can verify it is also a checksum item | ||
1496 | */ | ||
1497 | btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1); | ||
1498 | |||
1499 | if (found_key.objectid < inode->i_ino) | ||
1500 | goto next_key; | ||
1501 | |||
1502 | if (found_key.type != key.type || found_key.offset < new_size) | ||
1503 | goto next_key; | ||
1504 | |||
1505 | /* | ||
1506 | * if the key for the next leaf isn't a csum key from this objectid, | ||
1507 | * we can't be sure there aren't good items inside this leaf. | ||
1508 | * Bail out | ||
1509 | */ | ||
1510 | if (other_key.objectid != inode->i_ino || other_key.type != key.type) | ||
1511 | goto out; | ||
1512 | |||
1513 | /* | ||
1514 | * it is safe to delete this leaf, it contains only | ||
1515 | * csum items from this inode at an offset >= new_size | ||
1516 | */ | ||
1517 | ret = btrfs_del_leaf(trans, root, path, | ||
1518 | btrfs_node_blockptr(path->nodes[1], | ||
1519 | path->slots[1])); | ||
1520 | BUG_ON(ret); | ||
1521 | |||
1522 | next_key: | ||
1523 | btrfs_release_path(root, path); | ||
1524 | |||
1525 | if (other_key.objectid == inode->i_ino && | ||
1526 | other_key.type == key.type && other_key.offset > key.offset) { | ||
1527 | key.offset = other_key.offset; | ||
1528 | cond_resched(); | ||
1529 | goto again; | ||
1530 | } | ||
1531 | ret = 0; | ||
1532 | out: | ||
1533 | /* fixup any changes we've made to the path */ | ||
1534 | path->lowest_level = 0; | ||
1535 | path->keep_locks = 0; | ||
1536 | btrfs_release_path(root, path); | ||
1537 | return ret; | ||
1538 | } | ||
1539 | |||
1540 | /* | ||
1393 | * this can truncate away extent items, csum items and directory items. | 1541 | * this can truncate away extent items, csum items and directory items. |
1394 | * It starts at a high offset and removes keys until it can't find | 1542 | * It starts at a high offset and removes keys until it can't find |
1395 | * any higher than new_size | 1543 | * any higher than new_size |
@@ -1436,6 +1584,10 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | |||
1436 | key.type = (u8)-1; | 1584 | key.type = (u8)-1; |
1437 | 1585 | ||
1438 | btrfs_init_path(path); | 1586 | btrfs_init_path(path); |
1587 | |||
1588 | ret = drop_csum_leaves(trans, root, path, inode, new_size); | ||
1589 | BUG_ON(ret); | ||
1590 | |||
1439 | search_again: | 1591 | search_again: |
1440 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | 1592 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); |
1441 | if (ret < 0) { | 1593 | if (ret < 0) { |