author		Alex Tomas <alex@clusterfs.com>		2008-07-11 19:27:31 -0400
committer	Theodore Ts'o <tytso@mit.edu>		2008-07-11 19:27:31 -0400
commit		64769240bd07f446f83660bb143bb609d8ab4910
tree		db8bd8dda988adab6b4f6087a9b59ce504612009
parent		29a814d2ee0e43c2980f33f91c1311ec06c0aa35
ext4: Add delayed allocation support in data=writeback mode
Updated with fixes from Mingming Cao <cmm@us.ibm.com> to unlock and
release the page from page cache if the delalloc write_begin failed, and
properly handle preallocated blocks. Also added a fix to clear
buffer_delay in block_write_full_page() after allocating a delayed
buffer.
Updated with fixes from Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
to update i_disksize properly and to add bmap support for delayed
allocation.
Updated with a fix from Valerie Clement <valerie.clement@bull.net> to
avoid filesystem corruption when the filesystem is mounted with the
delalloc option and blocksize < pagesize.
Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
-rw-r--r--	fs/ext4/ext4.h	  1
-rw-r--r--	fs/ext4/inode.c	699
-rw-r--r--	fs/ext4/super.c	  6
3 files changed, 700 insertions(+), 6 deletions(-)
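For context before the patch body: with delayed allocation a freshly
written block has no physical block number until writeback allocates it,
which is why the ext4_bmap() hunk below flushes dirty pages before
mapping. A minimal userspace sketch of how this can be observed
(illustration only, not part of the patch; the file path is hypothetical,
the filesystem is assumed to be mounted with the new delalloc option, and
FIBMAP requires root):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* FIBMAP */

int main(void)
{
	const char buf[4096] = "delalloc test";	/* one 4K block */
	int block = 0;		/* logical block 0; FIBMAP replaces it in place */
	int fd = open("/mnt/ext4/testfile", O_CREAT | O_RDWR | O_TRUNC, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, buf, sizeof(buf)) != (ssize_t) sizeof(buf)) {
		perror("write");
		return 1;
	}
	/*
	 * With this patch, ext4_bmap() calls filemap_write_and_wait()
	 * first, so the answer is the real physical block rather than
	 * the 0 a still-delayed block would report.
	 */
	if (ioctl(fd, FIBMAP, &block) < 0)
		perror("FIBMAP");
	else
		printf("logical block 0 -> physical block %d\n", block);
	close(fd);
	return 0;
}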
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f65829bbe7aa..ee9576dc0ba1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -536,6 +536,7 @@ do { \
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
 #define EXT4_MOUNT_MBALLOC		0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7b9569179fdf..2bef4f879e4b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
+#include <linux/pagevec.h>
 #include <linux/mpage.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
@@ -46,6 +47,8 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 				   new_size);
 }
 
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
+
 /*
  * Test whether an inode is a fast symlink.
  */
@@ -1408,6 +1411,669 @@ static int ext4_journalled_write_end(struct file *file,
 }
 
 /*
+ * Delayed allocation stuff
+ */
+
+struct mpage_da_data {
+	struct inode *inode;
+	struct buffer_head lbh;			/* extent of blocks */
+	unsigned long first_page, next_page;	/* extent of pages */
+	get_block_t *get_block;
+	struct writeback_control *wbc;
+};
+
+/*
+ * mpage_da_submit_io - walks through the extent of pages and tries to
+ * write them with __mpage_writepage()
+ *
+ * @mpd->inode: inode
+ * @mpd->first_page: first page of the extent
+ * @mpd->next_page: page after the last page of the extent
+ * @mpd->get_block: the filesystem's block mapper function
+ *
+ * By the time mpage_da_submit_io() is called we expect all blocks
+ * to be allocated. This may be wrong if allocation failed.
+ *
+ * As pages are already locked by write_cache_pages(), we can't use it
+ */
+static int mpage_da_submit_io(struct mpage_da_data *mpd)
+{
+	struct address_space *mapping = mpd->inode->i_mapping;
+	struct mpage_data mpd_pp = {
+		.bio = NULL,
+		.last_block_in_bio = 0,
+		.get_block = mpd->get_block,
+		.use_writepage = 1,
+	};
+	int ret = 0, err, nr_pages, i;
+	unsigned long index, end;
+	struct pagevec pvec;
+
+	BUG_ON(mpd->next_page <= mpd->first_page);
+
+	pagevec_init(&pvec, 0);
+	index = mpd->first_page;
+	end = mpd->next_page - 1;
+
+	while (index <= end) {
+		/* XXX: optimize tail */
+		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			index = page->index;
+			if (index > end)
+				break;
+			index++;
+
+			err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+
+			/*
+			 * In the error case we have to continue because
+			 * the remaining pages are still locked
+			 * XXX: unlock and re-dirty them?
+			 */
+			if (ret == 0)
+				ret = err;
+		}
+		pagevec_release(&pvec);
+	}
+	if (mpd_pp.bio)
+		mpage_bio_submit(WRITE, mpd_pp.bio);
+
+	return ret;
+}
+
+/*
+ * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
+ *
+ * @mpd->inode - inode to walk through
+ * @exbh->b_blocknr - first block on a disk
+ * @exbh->b_size - amount of space in bytes
+ * @logical - first logical block to start assignment with
+ *
+ * The function goes through all passed space and puts the actual disk
+ * block numbers into the buffer heads, dropping BH_Delay
+ */
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
+				 struct buffer_head *exbh)
+{
+	struct inode *inode = mpd->inode;
+	struct address_space *mapping = inode->i_mapping;
+	int blocks = exbh->b_size >> inode->i_blkbits;
+	sector_t pblock = exbh->b_blocknr, cur_logical;
+	struct buffer_head *head, *bh;
+	unsigned long index, end;
+	struct pagevec pvec;
+	int nr_pages, i;
+
+	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	pagevec_init(&pvec, 0);
+
+	while (index <= end) {
+		/* XXX: optimize tail */
+		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			index = page->index;
+			if (index > end)
+				break;
+			index++;
+
+			BUG_ON(!PageLocked(page));
+			BUG_ON(PageWriteback(page));
+			BUG_ON(!page_has_buffers(page));
+
+			bh = page_buffers(page);
+			head = bh;
+
+			/* skip blocks out of the range */
+			do {
+				if (cur_logical >= logical)
+					break;
+				cur_logical++;
+			} while ((bh = bh->b_this_page) != head);
+
+			do {
+				if (cur_logical >= logical + blocks)
+					break;
+
+				if (buffer_delay(bh)) {
+					bh->b_blocknr = pblock;
+					clear_buffer_delay(bh);
+				} else if (buffer_mapped(bh)) {
+					BUG_ON(bh->b_blocknr != pblock);
+				}
+
+				cur_logical++;
+				pblock++;
+			} while ((bh = bh->b_this_page) != head);
+		}
+		pagevec_release(&pvec);
+	}
+}
+
+
+/*
+ * __unmap_underlying_blocks - just a helper function to unmap
+ *				set of blocks described by @bh
+ */
+static inline void __unmap_underlying_blocks(struct inode *inode,
+					     struct buffer_head *bh)
+{
+	struct block_device *bdev = inode->i_sb->s_bdev;
+	int blocks, i;
+
+	blocks = bh->b_size >> inode->i_blkbits;
+	for (i = 0; i < blocks; i++)
+		unmap_underlying_metadata(bdev, bh->b_blocknr + i);
+}
+
+/*
+ * mpage_da_map_blocks - go through given space
+ *
+ * @mpd->lbh - bh describing space
+ * @mpd->get_block - the filesystem's block mapper function
+ *
+ * The function skips space we know is already mapped to disk blocks.
+ *
+ * The function ignores errors returned by ->get_block(); real error
+ * handling is thus postponed to __mpage_writepage()
+ */
+static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+{
+	struct buffer_head *lbh = &mpd->lbh;
+	int err = 0, remain = lbh->b_size;
+	sector_t next = lbh->b_blocknr;
+	struct buffer_head new;
+
+	/*
+	 * We consider only non-mapped and non-allocated blocks
+	 */
+	if (buffer_mapped(lbh) && !buffer_delay(lbh))
+		return;
+
+	while (remain) {
+		new.b_state = lbh->b_state;
+		new.b_blocknr = 0;
+		new.b_size = remain;
+		err = mpd->get_block(mpd->inode, next, &new, 1);
+		if (err) {
+			/*
+			 * Rather than implement our own error handling
+			 * here, we just leave the remaining blocks
+			 * unallocated and try again with ->writepage()
+			 */
+			break;
+		}
+		BUG_ON(new.b_size == 0);
+
+		if (buffer_new(&new))
+			__unmap_underlying_blocks(mpd->inode, &new);
+
+		/*
+		 * If the blocks are marked delayed, we need to put in
+		 * the actual block numbers and drop the delayed bit
+		 */
+		if (buffer_delay(lbh))
+			mpage_put_bnr_to_bhs(mpd, next, &new);
+
+		/* go for the remaining blocks */
+		next += new.b_size >> mpd->inode->i_blkbits;
+		remain -= new.b_size;
+	}
+}
+
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+
+/*
+ * mpage_add_bh_to_extent - try to add one more block to extent of blocks
+ *
+ * @mpd->lbh - extent of blocks
+ * @logical - logical number of the block in the file
+ * @bh - bh of the block (used to access block's state)
+ *
+ * The function is used to collect contiguous blocks in the same state
+ */
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
+				   sector_t logical, struct buffer_head *bh)
+{
+	struct buffer_head *lbh = &mpd->lbh;
+	sector_t next;
+
+	next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
+
+	/*
+	 * First block in the extent
+	 */
+	if (lbh->b_size == 0) {
+		lbh->b_blocknr = logical;
+		lbh->b_size = bh->b_size;
+		lbh->b_state = bh->b_state & BH_FLAGS;
+		return;
+	}
+
+	/*
+	 * Can we merge the block into our big extent?
+	 */
+	if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
+		lbh->b_size += bh->b_size;
+		return;
+	}
+
+	/*
+	 * We couldn't merge the block into our extent, so we
+	 * need to flush the current extent and start a new one
+	 */
+	mpage_da_map_blocks(mpd);
+
+	/*
+	 * Now start a new extent
+	 */
+	lbh->b_size = bh->b_size;
+	lbh->b_state = bh->b_state & BH_FLAGS;
+	lbh->b_blocknr = logical;
+}
+
+/*
+ * __mpage_da_writepage - finds extent of pages and blocks
+ *
+ * @page: page to consider
+ * @wbc: not used, we just follow rules
+ * @data: context
+ *
+ * The function finds extents of pages and scans them for all blocks.
+ */
+static int __mpage_da_writepage(struct page *page,
+				struct writeback_control *wbc, void *data)
+{
+	struct mpage_da_data *mpd = data;
+	struct inode *inode = mpd->inode;
+	struct buffer_head *bh, *head, fake;
+	sector_t logical;
+
+	/*
+	 * Can we merge this page into the current extent?
+	 */
+	if (mpd->next_page != page->index) {
+		/*
+		 * Nope, we can't. So, we map non-allocated blocks
+		 * and start IO on them using __mpage_writepage()
+		 */
+		if (mpd->next_page != mpd->first_page) {
+			mpage_da_map_blocks(mpd);
+			mpage_da_submit_io(mpd);
+		}
+
+		/*
+		 * Start next extent of pages ...
+		 */
+		mpd->first_page = page->index;
+
+		/*
+		 * ... and blocks
+		 */
+		mpd->lbh.b_size = 0;
+		mpd->lbh.b_state = 0;
+		mpd->lbh.b_blocknr = 0;
+	}
+
+	mpd->next_page = page->index + 1;
+	logical = (sector_t) page->index <<
+		  (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	if (!page_has_buffers(page)) {
+		/*
+		 * There are no buffer heads attached yet (mmap?);
+		 * we treat the page as full of dirty blocks
+		 */
+		bh = &fake;
+		bh->b_size = PAGE_CACHE_SIZE;
+		bh->b_state = 0;
+		set_buffer_dirty(bh);
+		set_buffer_uptodate(bh);
+		mpage_add_bh_to_extent(mpd, logical, bh);
+	} else {
+		/*
+		 * Page with regular buffer heads, just add all dirty ones
+		 */
+		head = page_buffers(page);
+		bh = head;
+		do {
+			BUG_ON(buffer_locked(bh));
+			if (buffer_dirty(bh))
+				mpage_add_bh_to_extent(mpd, logical, bh);
+			logical++;
+		} while ((bh = bh->b_this_page) != head);
+	}
+
+	return 0;
+}
+
+/*
+ * mpage_da_writepages - walks the list of dirty pages of the given
+ * address space, allocates non-allocated blocks, maps newly-allocated
+ * blocks to existing bhs and issues IO on them
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function.
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * In order to avoid duplication of logic that deals with partial pages,
+ * multiple bios per page, etc, we find non-allocated blocks, allocate
+ * them with minimal calls to ->get_block() and re-use __mpage_writepage()
+ *
+ * It's important that we call __mpage_writepage() only once for each
+ * involved page, otherwise we'd have to implement more complicated logic
+ * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
+ *
+ * See comments to mpage_writepages()
+ */
+static int mpage_da_writepages(struct address_space *mapping,
+			       struct writeback_control *wbc,
+			       get_block_t get_block)
+{
+	struct mpage_da_data mpd;
+	int ret;
+
+	if (!get_block)
+		return generic_writepages(mapping, wbc);
+
+	mpd.wbc = wbc;
+	mpd.inode = mapping->host;
+	mpd.lbh.b_size = 0;
+	mpd.lbh.b_state = 0;
+	mpd.lbh.b_blocknr = 0;
+	mpd.first_page = 0;
+	mpd.next_page = 0;
+	mpd.get_block = get_block;
+
+	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+
+	/*
+	 * Handle the last extent of pages
+	 */
+	if (mpd.next_page != mpd.first_page) {
+		mpage_da_map_blocks(&mpd);
+		mpage_da_submit_io(&mpd);
+	}
+
+	return ret;
+}
+
+/*
+ * This is a special callback for ->write_begin() only;
+ * its intention is to return a mapped block or reserve space
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+				  struct buffer_head *bh_result, int create)
+{
+	int ret = 0;
+
+	BUG_ON(create == 0);
+	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
+	/*
+	 * First, we need to know whether the block is already allocated;
+	 * preallocated blocks are unmapped but should be treated
+	 * the same as allocated blocks.
+	 */
+	ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0);
+	if (ret == 0) {
+		/* the block isn't allocated yet, let's reserve space */
+		/* XXX: call reservation here */
+		/*
+		 * XXX: __block_prepare_write() unmaps passed block,
+		 * is it OK?
+		 */
+		map_bh(bh_result, inode->i_sb, 0);
+		set_buffer_new(bh_result);
+		set_buffer_delay(bh_result);
+	} else if (ret > 0) {
+		bh_result->b_size = (ret << inode->i_blkbits);
+		ret = 0;
+	}
+
+	return ret;
+}
+
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+	loff_t disksize = EXT4_I(inode)->i_disksize;
+	handle_t *handle = NULL;
+
+	if (create) {
+		handle = ext4_journal_start(inode, needed_blocks);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+	}
+
+	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+				   bh_result, create, 0);
+	if (ret > 0) {
+		bh_result->b_size = (ret << inode->i_blkbits);
+
+		/*
+		 * Update the on-disk size along with the block allocation;
+		 * we don't use 'extend_disksize' as the size may change
+		 * within an already allocated block -bzzz
+		 */
+		disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+		if (disksize > i_size_read(inode))
+			disksize = i_size_read(inode);
+		if (disksize > EXT4_I(inode)->i_disksize) {
+			/*
+			 * XXX: replace with spinlock if seen contended -bzzz
+			 */
+			down_write(&EXT4_I(inode)->i_data_sem);
+			if (disksize > EXT4_I(inode)->i_disksize)
+				EXT4_I(inode)->i_disksize = disksize;
+			up_write(&EXT4_I(inode)->i_data_sem);
+
+			if (EXT4_I(inode)->i_disksize == disksize) {
+				if (handle == NULL)
+					handle = ext4_journal_start(inode, 1);
+				if (!IS_ERR(handle))
+					ext4_mark_inode_dirty(handle, inode);
+			}
+		}
+
+		ret = 0;
+	}
+
+out:
+	if (handle && !IS_ERR(handle))
+		ext4_journal_stop(handle);
+
+	return ret;
+}
+/* FIXME!! only support data=writeback mode */
+static int ext4_da_writepage(struct page *page,
+			     struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	handle_t *handle = NULL;
+	int ret = 0;
+	int err;
+
+	if (ext4_journal_current_handle())
+		goto out_fail;
+
+	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_fail;
+	}
+
+	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+		ret = nobh_writepage(page, ext4_get_block, wbc);
+	else
+		ret = block_write_full_page(page, ext4_get_block, wbc);
+
+	if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
+		EXT4_I(inode)->i_disksize = inode->i_size;
+		ext4_mark_inode_dirty(handle, inode);
+	}
+
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+
+out_fail:
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+	return ret;
+}
+
+static int ext4_da_writepages(struct address_space *mapping,
+			      struct writeback_control *wbc)
+{
+	return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
+}
+
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+			       loff_t pos, unsigned len, unsigned flags,
+			       struct page **pagep, void **fsdata)
+{
+	int ret;
+	struct page *page;
+	pgoff_t index;
+	unsigned from, to;
+	struct inode *inode = mapping->host;
+	handle_t *handle;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + len;
+
+	/*
+	 * With delayed allocation, we don't log the i_disksize update
+	 * if there is delayed block allocation. But we still need
+	 * to journal the i_disksize update if the write goes to the
+	 * end of a file that has an already mapped buffer.
+	 */
+	handle = ext4_journal_start(inode, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	page = __grab_cache_page(mapping, index);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+
+	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+				ext4_da_get_block_prep);
+	if (ret < 0) {
+		unlock_page(page);
+		ext4_journal_stop(handle);
+		page_cache_release(page);
+	}
+
+out:
+	return ret;
+}
+
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+	return !buffer_mapped(bh) || buffer_delay(bh);
+}
+
+static int ext4_da_write_end(struct file *file,
+			     struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned copied,
+			     struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	int ret = 0, ret2;
+	handle_t *handle = ext4_journal_current_handle();
+	loff_t new_i_size;
+
+	/*
+	 * generic_write_end() will run mark_inode_dirty() if i_size
+	 * changes. So let's piggyback the i_disksize mark_inode_dirty
+	 * into that.
+	 */
+
+	new_i_size = pos + copied;
+	if (new_i_size > EXT4_I(inode)->i_disksize)
+		if (!walk_page_buffers(NULL, page_buffers(page),
+				       0, len, NULL, ext4_bh_unmapped_or_delay)) {
+			/*
+			 * Update i_disksize when extending the file
+			 * without needing block allocation
+			 */
+			if (ext4_should_order_data(inode))
+				ret = ext4_jbd2_file_inode(handle, inode);
+
+			EXT4_I(inode)->i_disksize = new_i_size;
+		}
+	ret2 = generic_write_end(file, mapping, pos, len, copied,
+				 page, fsdata);
+	copied = ret2;
+	if (ret2 < 0)
+		ret = ret2;
+	ret2 = ext4_journal_stop(handle);
+	if (!ret)
+		ret = ret2;
+
+	return ret ? ret : copied;
+}
+
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct buffer_head *head, *bh;
+	unsigned int curr_off = 0;
+
+	/*
+	 * Drop reserved blocks
+	 */
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		goto out;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		unsigned int next_off = curr_off + bh->b_size;
+
+		/*
+		 * is this block fully invalidated?
+		 */
+		if (offset <= curr_off && buffer_delay(bh)) {
+			clear_buffer_delay(bh);
+			/* XXX: add real stuff here */
+		}
+		curr_off = next_off;
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+out:
+	ext4_invalidatepage(page, offset);
+
+	return;
+}
+
+
+/*
  * bmap() is special. It gets used by applications such as lilo and by
  * the swapper to find the on-disk block of a specific piece of data.
  *
@@ -1427,6 +2093,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 	journal_t *journal;
 	int err;
 
+	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+			test_opt(inode->i_sb, DELALLOC)) {
+		/*
+		 * With delalloc we want to sync the file
+		 * so that we can make sure we allocate
+		 * blocks for the file
+		 */
+		filemap_write_and_wait(mapping);
+	}
+
 	if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
 		/*
 		 * This is a REALLY heavyweight approach, but the use of
@@ -1471,11 +2147,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
-{
-	return !buffer_mapped(bh) || buffer_delay(bh);
-}
-
 /*
  * Note that we don't need to start a transaction unless we're journaling data
  * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -1832,10 +2503,28 @@ static const struct address_space_operations ext4_journalled_aops = {
 	.releasepage	= ext4_releasepage,
 };
 
+static const struct address_space_operations ext4_da_aops = {
+	.readpage		= ext4_readpage,
+	.readpages		= ext4_readpages,
+	.writepage		= ext4_da_writepage,
+	.writepages		= ext4_da_writepages,
+	.sync_page		= block_sync_page,
+	.write_begin		= ext4_da_write_begin,
+	.write_end		= ext4_da_write_end,
+	.bmap			= ext4_bmap,
+	.invalidatepage		= ext4_da_invalidatepage,
+	.releasepage		= ext4_releasepage,
+	.direct_IO		= ext4_direct_IO,
+	.migratepage		= buffer_migrate_page,
+};
+
 void ext4_set_aops(struct inode *inode)
 {
 	if (ext4_should_order_data(inode))
 		inode->i_mapping->a_ops = &ext4_ordered_aops;
+	else if (ext4_should_writeback_data(inode) &&
+		 test_opt(inode->i_sb, DELALLOC))
+		inode->i_mapping->a_ops = &ext4_da_aops;
 	else if (ext4_should_writeback_data(inode))
 		inode->i_mapping->a_ops = &ext4_writeback_aops;
 	else
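The heart of the new writeback path above is extent accumulation:
mpage_add_bh_to_extent() merges blocks into one extent while they are
logically contiguous and share the same state flags, and anything
unmergeable flushes the current extent through mpage_da_map_blocks() and
mpage_da_submit_io(). A standalone, simplified model of that logic
(illustration only, not kernel code; the types, names and flag values
here are made up):

#include <stdio.h>

struct extent {
	unsigned long start;	/* first logical block */
	unsigned long len;	/* number of blocks */
	unsigned int state;	/* BH_FLAGS-like state bits */
};

/* stands in for mpage_da_map_blocks() + mpage_da_submit_io() */
static void flush(struct extent *ext)
{
	if (ext->len)
		printf("map+write blocks %lu..%lu (state %#x)\n",
		       ext->start, ext->start + ext->len - 1, ext->state);
}

static void add_block(struct extent *ext, unsigned long logical,
		      unsigned int state)
{
	if (ext->len == 0) {		/* first block in the extent */
		ext->start = logical;
		ext->len = 1;
		ext->state = state;
		return;
	}
	if (logical == ext->start + ext->len && state == ext->state) {
		ext->len++;		/* contiguous and same state: merge */
		return;
	}
	flush(ext);			/* can't merge: flush, start anew */
	ext->start = logical;
	ext->len = 1;
	ext->state = state;
}

int main(void)
{
	struct extent ext = { 0, 0, 0 };

	/* blocks 10..12 delayed (0x1), then a gap, then block 20 mapped (0x2) */
	add_block(&ext, 10, 0x1);
	add_block(&ext, 11, 0x1);
	add_block(&ext, 12, 0x1);
	add_block(&ext, 20, 0x2);
	flush(&ext);			/* handle the last extent */
	return 0;
}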
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 629d0fa27e3a..de9d3d0eb203 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -898,7 +898,7 @@ enum {
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
-	Opt_mballoc, Opt_nomballoc, Opt_stripe,
+	Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc,
 };
 
 static match_table_t tokens = {
@@ -957,6 +957,7 @@ static match_table_t tokens = {
 	{Opt_nomballoc, "nomballoc"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_resize, "resize"},
+	{Opt_delalloc, "delalloc"},
 	{Opt_err, NULL},
 };
 
@@ -1335,6 +1336,9 @@ set_qf_format:
 			return 0;
 		sbi->s_stripe = option;
 		break;
+	case Opt_delalloc:
+		set_opt(sbi->s_mount_opt, DELALLOC);
+		break;
 	default:
 		printk (KERN_ERR
 			"EXT4-fs: Unrecognized mount option \"%s\" "
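Delayed allocation remains opt-in with this patch: nothing sets
EXT4_MOUNT_DELALLOC by default, and ext4_set_aops() installs ext4_da_aops
only for inodes in data=writeback mode on a filesystem mounted with the
new option. A hedged usage sketch, equivalent to "mount -o delalloc"
(the device and mount point are hypothetical, and kernels of this vintage
may register the filesystem as "ext4dev" rather than "ext4"):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* the "delalloc" data string ends up in parse_options() shown above */
	if (mount("/dev/sdb1", "/mnt/ext4", "ext4", 0, "delalloc") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}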