diff options
author | Tristan Ye <tristan.ye@oracle.com> | 2010-05-11 05:54:45 -0400 |
---|---|---|
committer | Joel Becker <joel.becker@oracle.com> | 2010-05-18 15:31:05 -0400 |
commit | c1631d4a484fbb498e35d661f1aebd64c86b66bf (patch) | |
tree | 06e951cfebbd616dbaa0017d1c030a3f6e0d8d88 /fs/ocfs2 | |
parent | ee149a7c6cbaee0e3a1a7d9e9f92711228ef5236 (diff) |
Ocfs2: Optimize punching-hole code.
This patch simplifies the logic of handling existing holes and
skipping extent blocks and removes some confusing comments.
The patch survived the fill_verify_holes testcase in ocfs2-test.
It also passed my manual sanity check and stress tests with enormous
extent records.
Currently punching a hole on a file with 3+ extent tree depth was
really a performance disaster. It can even take several hours,
though we may not hit this in real life with such a huge extent
number.
One simple way to improve the performance is quite straightforward.
From the logic of truncate, we can punch the hole from hole_end to
hole_start, which reduces the overhead of btree operations in a
significant way, such as tree rotation and moving.
Following is the testing result when punching hole from 0 to file end
in bytes, on a 1G file, 1G file consists of 256k extent records, each record
cover 4k data(just one cluster, clustersize is 4k):
===========================================================================
* Original punching-hole mechanism:
===========================================================================
I waited 1 hour for its completion, unfortunately it's still ongoing.
===========================================================================
* Patched punching-hode mechanism:
===========================================================================
real 0m2.518s
user 0m0.000s
sys 0m2.445s
That means we've gained up to 1000 times improvement on performance in this
case, whee! It's fairly cool. and it looks like that performance gain will
be raising when extent records grow.
The patch was based on my former 2 patches, which were about truncating
codes optimization and fixup to handle CoW on punching hole.
Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Diffstat (limited to 'fs/ocfs2')
-rw-r--r-- | fs/ocfs2/file.c | 164 |
1 files changed, 140 insertions, 24 deletions
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 3346e5b199d..9c1047c2e44 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -1418,18 +1418,90 @@ out: | |||
1418 | return ret; | 1418 | return ret; |
1419 | } | 1419 | } |
1420 | 1420 | ||
1421 | static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos) | ||
1422 | { | ||
1423 | int i; | ||
1424 | struct ocfs2_extent_rec *rec = NULL; | ||
1425 | |||
1426 | for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { | ||
1427 | |||
1428 | rec = &el->l_recs[i]; | ||
1429 | |||
1430 | if (le32_to_cpu(rec->e_cpos) < pos) | ||
1431 | break; | ||
1432 | } | ||
1433 | |||
1434 | return i; | ||
1435 | } | ||
1436 | |||
1437 | /* | ||
1438 | * Helper to calculate the punching pos and length in one run, we handle the | ||
1439 | * following three cases in order: | ||
1440 | * | ||
1441 | * - remove the entire record | ||
1442 | * - remove a partial record | ||
1443 | * - no record needs to be removed (hole-punching completed) | ||
1444 | */ | ||
1445 | static void ocfs2_calc_trunc_pos(struct inode *inode, | ||
1446 | struct ocfs2_extent_list *el, | ||
1447 | struct ocfs2_extent_rec *rec, | ||
1448 | u32 trunc_start, u32 *trunc_cpos, | ||
1449 | u32 *trunc_len, u32 *trunc_end, | ||
1450 | u64 *blkno, int *done) | ||
1451 | { | ||
1452 | int ret = 0; | ||
1453 | u32 coff, range; | ||
1454 | |||
1455 | range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); | ||
1456 | |||
1457 | if (le32_to_cpu(rec->e_cpos) >= trunc_start) { | ||
1458 | *trunc_cpos = le32_to_cpu(rec->e_cpos); | ||
1459 | /* | ||
1460 | * Skip holes if any. | ||
1461 | */ | ||
1462 | if (range < *trunc_end) | ||
1463 | *trunc_end = range; | ||
1464 | *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos); | ||
1465 | *blkno = le64_to_cpu(rec->e_blkno); | ||
1466 | *trunc_end = le32_to_cpu(rec->e_cpos); | ||
1467 | } else if (range > trunc_start) { | ||
1468 | *trunc_cpos = trunc_start; | ||
1469 | *trunc_len = *trunc_end - trunc_start; | ||
1470 | coff = trunc_start - le32_to_cpu(rec->e_cpos); | ||
1471 | *blkno = le64_to_cpu(rec->e_blkno) + | ||
1472 | ocfs2_clusters_to_blocks(inode->i_sb, coff); | ||
1473 | *trunc_end = trunc_start; | ||
1474 | } else { | ||
1475 | /* | ||
1476 | * It may have two following possibilities: | ||
1477 | * | ||
1478 | * - last record has been removed | ||
1479 | * - trunc_start was within a hole | ||
1480 | * | ||
1481 | * both two cases mean the completion of hole punching. | ||
1482 | */ | ||
1483 | ret = 1; | ||
1484 | } | ||
1485 | |||
1486 | *done = ret; | ||
1487 | } | ||
1488 | |||
1421 | static int ocfs2_remove_inode_range(struct inode *inode, | 1489 | static int ocfs2_remove_inode_range(struct inode *inode, |
1422 | struct buffer_head *di_bh, u64 byte_start, | 1490 | struct buffer_head *di_bh, u64 byte_start, |
1423 | u64 byte_len) | 1491 | u64 byte_len) |
1424 | { | 1492 | { |
1425 | int ret = 0, flags = 0; | 1493 | int ret = 0, flags = 0, done = 0, i; |
1426 | u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; | 1494 | u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; |
1495 | u32 cluster_in_el; | ||
1427 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1496 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1428 | struct ocfs2_cached_dealloc_ctxt dealloc; | 1497 | struct ocfs2_cached_dealloc_ctxt dealloc; |
1429 | struct address_space *mapping = inode->i_mapping; | 1498 | struct address_space *mapping = inode->i_mapping; |
1430 | struct ocfs2_extent_tree et; | 1499 | struct ocfs2_extent_tree et; |
1500 | struct ocfs2_path *path = NULL; | ||
1501 | struct ocfs2_extent_list *el = NULL; | ||
1502 | struct ocfs2_extent_rec *rec = NULL; | ||
1431 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | 1503 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; |
1432 | u64 refcount_loc = le64_to_cpu(di->i_refcount_loc); | 1504 | u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc); |
1433 | 1505 | ||
1434 | ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); | 1506 | ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); |
1435 | ocfs2_init_dealloc_ctxt(&dealloc); | 1507 | ocfs2_init_dealloc_ctxt(&dealloc); |
@@ -1477,16 +1549,13 @@ static int ocfs2_remove_inode_range(struct inode *inode, | |||
1477 | } | 1549 | } |
1478 | 1550 | ||
1479 | trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); | 1551 | trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); |
1480 | trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; | 1552 | trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits; |
1481 | if (trunc_len >= trunc_start) | 1553 | cluster_in_el = trunc_end; |
1482 | trunc_len -= trunc_start; | ||
1483 | else | ||
1484 | trunc_len = 0; | ||
1485 | 1554 | ||
1486 | mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", | 1555 | mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n", |
1487 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 1556 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
1488 | (unsigned long long)byte_start, | 1557 | (unsigned long long)byte_start, |
1489 | (unsigned long long)byte_len, trunc_start, trunc_len); | 1558 | (unsigned long long)byte_len, trunc_start, trunc_end); |
1490 | 1559 | ||
1491 | ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); | 1560 | ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); |
1492 | if (ret) { | 1561 | if (ret) { |
@@ -1494,32 +1563,79 @@ static int ocfs2_remove_inode_range(struct inode *inode, | |||
1494 | goto out; | 1563 | goto out; |
1495 | } | 1564 | } |
1496 | 1565 | ||
1497 | cpos = trunc_start; | 1566 | path = ocfs2_new_path_from_et(&et); |
1498 | while (trunc_len) { | 1567 | if (!path) { |
1499 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, | 1568 | ret = -ENOMEM; |
1500 | &alloc_size, &flags); | 1569 | mlog_errno(ret); |
1570 | goto out; | ||
1571 | } | ||
1572 | |||
1573 | while (trunc_end > trunc_start) { | ||
1574 | |||
1575 | ret = ocfs2_find_path(INODE_CACHE(inode), path, | ||
1576 | cluster_in_el); | ||
1501 | if (ret) { | 1577 | if (ret) { |
1502 | mlog_errno(ret); | 1578 | mlog_errno(ret); |
1503 | goto out; | 1579 | goto out; |
1504 | } | 1580 | } |
1505 | 1581 | ||
1506 | if (alloc_size > trunc_len) | 1582 | el = path_leaf_el(path); |
1507 | alloc_size = trunc_len; | 1583 | |
1584 | i = ocfs2_find_rec(el, trunc_end); | ||
1585 | /* | ||
1586 | * Need to go to previous extent block. | ||
1587 | */ | ||
1588 | if (i < 0) { | ||
1589 | if (path->p_tree_depth == 0) | ||
1590 | break; | ||
1508 | 1591 | ||
1509 | /* Only do work for non-holes */ | 1592 | ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, |
1510 | if (phys_cpos != 0) { | 1593 | path, |
1511 | ret = ocfs2_remove_btree_range(inode, &et, cpos, | 1594 | &cluster_in_el); |
1512 | phys_cpos, alloc_size, | ||
1513 | flags, &dealloc, | ||
1514 | refcount_loc); | ||
1515 | if (ret) { | 1595 | if (ret) { |
1516 | mlog_errno(ret); | 1596 | mlog_errno(ret); |
1517 | goto out; | 1597 | goto out; |
1518 | } | 1598 | } |
1599 | |||
1600 | /* | ||
1601 | * We've reached the leftmost extent block, | ||
1602 | * it's safe to leave. | ||
1603 | */ | ||
1604 | if (cluster_in_el == 0) | ||
1605 | break; | ||
1606 | |||
1607 | /* | ||
1608 | * The 'pos' searched for previous extent block is | ||
1609 | * always one cluster less than actual trunc_end. | ||
1610 | */ | ||
1611 | trunc_end = cluster_in_el + 1; | ||
1612 | |||
1613 | ocfs2_reinit_path(path, 1); | ||
1614 | |||
1615 | continue; | ||
1616 | |||
1617 | } else | ||
1618 | rec = &el->l_recs[i]; | ||
1619 | |||
1620 | ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos, | ||
1621 | &trunc_len, &trunc_end, &blkno, &done); | ||
1622 | if (done) | ||
1623 | break; | ||
1624 | |||
1625 | flags = rec->e_flags; | ||
1626 | phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); | ||
1627 | |||
1628 | ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, | ||
1629 | phys_cpos, trunc_len, flags, | ||
1630 | &dealloc, refcount_loc); | ||
1631 | if (ret < 0) { | ||
1632 | mlog_errno(ret); | ||
1633 | goto out; | ||
1519 | } | 1634 | } |
1520 | 1635 | ||
1521 | cpos += alloc_size; | 1636 | cluster_in_el = trunc_end; |
1522 | trunc_len -= alloc_size; | 1637 | |
1638 | ocfs2_reinit_path(path, 1); | ||
1523 | } | 1639 | } |
1524 | 1640 | ||
1525 | ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); | 1641 | ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); |