author		Alex Tomas <alex@clusterfs.com>		2008-07-11 19:27:31 -0400
committer	Theodore Ts'o <tytso@mit.edu>		2008-07-11 19:27:31 -0400
commit		64769240bd07f446f83660bb143bb609d8ab4910 (patch)
tree		db8bd8dda988adab6b4f6087a9b59ce504612009 /fs/ext4
parent		29a814d2ee0e43c2980f33f91c1311ec06c0aa35 (diff)
ext4: Add delayed allocation support in data=writeback mode
Updated with fixes from Mingming Cao <cmm@us.ibm.com> to unlock and
release the page from the page cache if the delalloc write_begin failed,
and to properly handle preallocated blocks. Also added a fix to clear
buffer_delay in block_write_full_page() after allocating a delayed buffer.

Updated with fixes from Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
to update i_disksize properly and to add bmap support for delayed
allocation.

Updated with a fix from Valerie Clement <valerie.clement@bull.net> to
avoid filesystem corruption when the filesystem is mounted with the
delalloc option and blocksize < pagesize.

Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
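The heart of the patch is extent collection at writeback time: dirty buffers
left unallocated (BH_Delay) by write_begin are merged into one logical extent
and allocated with as few ->get_block() calls as possible. Below is a minimal
userspace sketch of that merge-or-flush loop (see mpage_add_bh_to_extent() and
mpage_da_map_blocks() in the diff); the extent struct, flush_extent() and
add_block() are illustrative stand-ins, not kernel interfaces.

/*
 * Toy model of the delalloc extent collection: logically contiguous
 * blocks in the same state are merged into one extent; on a mismatch
 * the current extent is "allocated" and a new one is started.
 */
#include <stdio.h>

struct extent {
        unsigned long start;    /* first logical block */
        unsigned long len;      /* number of blocks, 0 == empty */
        int state;              /* stand-in for bh->b_state & BH_FLAGS */
};

static void flush_extent(struct extent *e)
{
        if (e->len)     /* one ->get_block() call would cover this range */
                printf("allocate %lu block(s) at logical %lu\n",
                       e->len, e->start);
}

static void add_block(struct extent *e, unsigned long logical, int state)
{
        if (e->len == 0) {                      /* first block in the extent */
                e->start = logical;
                e->len = 1;
                e->state = state;
        } else if (logical == e->start + e->len && state == e->state) {
                e->len++;                       /* contiguous, same state: merge */
        } else {
                flush_extent(e);                /* can't merge: flush, restart */
                e->start = logical;
                e->len = 1;
                e->state = state;
        }
}

int main(void)
{
        unsigned long dirty[] = { 10, 11, 12, 40, 41 };
        struct extent e = { 0, 0, 0 };
        unsigned int i;

        for (i = 0; i < sizeof(dirty) / sizeof(dirty[0]); i++)
                add_block(&e, dirty[i], 1);
        flush_extent(&e);       /* handle the last extent */
        return 0;
}

Run on the sample input this prints two extents (3 blocks at 10, 2 blocks at
40), i.e. two allocation calls instead of five, which is the point of
deferring allocation until writepages time.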
Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/ext4.h  |   1
-rw-r--r--  fs/ext4/inode.c | 699
-rw-r--r--  fs/ext4/super.c |   6
3 files changed, 700 insertions(+), 6 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f65829bbe7aa..ee9576dc0ba1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -536,6 +536,7 @@ do { \
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
 #define EXT4_MOUNT_MBALLOC		0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
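The new flag takes the next free bit in s_mount_opt; set_opt(), clear_opt()
and test_opt() reduce to plain bit operations on it. A standalone sketch with
the macros inlined (the variable name mirrors the kernel field, the program
itself is illustrative):

#include <stdio.h>

#define EXT4_MOUNT_MBALLOC	0x4000000	/* Buddy allocation support */
#define EXT4_MOUNT_DELALLOC	0x8000000	/* Delalloc support */

int main(void)
{
        unsigned long s_mount_opt = EXT4_MOUNT_MBALLOC;

        s_mount_opt |= EXT4_MOUNT_DELALLOC;     /* set_opt(..., DELALLOC) */
        printf("delalloc %s\n",
               (s_mount_opt & EXT4_MOUNT_DELALLOC) ? "on" : "off");
        s_mount_opt &= ~EXT4_MOUNT_DELALLOC;    /* clear_opt(..., DELALLOC) */
        printf("delalloc %s\n",
               (s_mount_opt & EXT4_MOUNT_DELALLOC) ? "on" : "off");
        return 0;
}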
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7b9569179fdf..2bef4f879e4b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
+#include <linux/pagevec.h>
 #include <linux/mpage.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
@@ -46,6 +47,8 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 		new_size);
 }
 
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
+
 /*
  * Test whether an inode is a fast symlink.
  */
@@ -1408,6 +1411,669 @@ static int ext4_journalled_write_end(struct file *file,
 }
 
 /*
+ * Delayed allocation stuff
+ */
+
+struct mpage_da_data {
+        struct inode *inode;
+        struct buffer_head lbh;                 /* extent of blocks */
+        unsigned long first_page, next_page;    /* extent of pages */
+        get_block_t *get_block;
+        struct writeback_control *wbc;
+};
+
+/*
+ * mpage_da_submit_io - walks through the extent of pages and tries to
+ * write them with __mpage_writepage()
+ *
+ * @mpd->inode: inode
+ * @mpd->first_page: first page of the extent
+ * @mpd->next_page: page after the last page of the extent
+ * @mpd->get_block: the filesystem's block mapper function
+ *
+ * By the time mpage_da_submit_io() is called we expect all blocks
+ * to be allocated; this may be wrong if allocation failed.
+ *
+ * As pages are already locked by write_cache_pages(), we can't use it
+ */
+static int mpage_da_submit_io(struct mpage_da_data *mpd)
+{
+        struct address_space *mapping = mpd->inode->i_mapping;
+        struct mpage_data mpd_pp = {
+                .bio = NULL,
+                .last_block_in_bio = 0,
+                .get_block = mpd->get_block,
+                .use_writepage = 1,
+        };
+        int ret = 0, err, nr_pages, i;
+        unsigned long index, end;
+        struct pagevec pvec;
+
+        BUG_ON(mpd->next_page <= mpd->first_page);
+
+        pagevec_init(&pvec, 0);
+        index = mpd->first_page;
+        end = mpd->next_page - 1;
+
+        while (index <= end) {
+                /* XXX: optimize tail */
+                nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+                if (nr_pages == 0)
+                        break;
+                for (i = 0; i < nr_pages; i++) {
+                        struct page *page = pvec.pages[i];
+
+                        index = page->index;
+                        if (index > end)
+                                break;
+                        index++;
+
+                        err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+
+                        /*
+                         * In the error case we have to continue because
+                         * the remaining pages are still locked.
+                         * XXX: unlock and re-dirty them?
+                         */
+                        if (ret == 0)
+                                ret = err;
+                }
+                pagevec_release(&pvec);
+        }
+        if (mpd_pp.bio)
+                mpage_bio_submit(WRITE, mpd_pp.bio);
+
+        return ret;
+}
+
1488
1489/*
1490 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
1491 *
1492 * @mpd->inode - inode to walk through
1493 * @exbh->b_blocknr - first block on a disk
1494 * @exbh->b_size - amount of space in bytes
1495 * @logical - first logical block to start assignment with
1496 *
1497 * the function goes through all passed space and put actual disk
1498 * block numbers into buffer heads, dropping BH_Delay
1499 */
1500static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1501 struct buffer_head *exbh)
1502{
1503 struct inode *inode = mpd->inode;
1504 struct address_space *mapping = inode->i_mapping;
1505 int blocks = exbh->b_size >> inode->i_blkbits;
1506 sector_t pblock = exbh->b_blocknr, cur_logical;
1507 struct buffer_head *head, *bh;
1508 unsigned long index, end;
1509 struct pagevec pvec;
1510 int nr_pages, i;
1511
1512 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1513 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1514 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1515
1516 pagevec_init(&pvec, 0);
1517
1518 while (index <= end) {
1519 /* XXX: optimize tail */
1520 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1521 if (nr_pages == 0)
1522 break;
1523 for (i = 0; i < nr_pages; i++) {
1524 struct page *page = pvec.pages[i];
1525
1526 index = page->index;
1527 if (index > end)
1528 break;
1529 index++;
1530
1531 BUG_ON(!PageLocked(page));
1532 BUG_ON(PageWriteback(page));
1533 BUG_ON(!page_has_buffers(page));
1534
1535 bh = page_buffers(page);
1536 head = bh;
1537
1538 /* skip blocks out of the range */
1539 do {
1540 if (cur_logical >= logical)
1541 break;
1542 cur_logical++;
1543 } while ((bh = bh->b_this_page) != head);
1544
1545 do {
1546 if (cur_logical >= logical + blocks)
1547 break;
1548
1549 if (buffer_delay(bh)) {
1550 bh->b_blocknr = pblock;
1551 clear_buffer_delay(bh);
1552 } else if (buffer_mapped(bh)) {
1553 BUG_ON(bh->b_blocknr != pblock);
1554 }
1555
1556 cur_logical++;
1557 pblock++;
1558 } while ((bh = bh->b_this_page) != head);
1559 }
1560 pagevec_release(&pvec);
1561 }
1562}
+
+
+/*
+ * __unmap_underlying_blocks - just a helper function to unmap
+ * a set of blocks described by @bh
+ */
+static inline void __unmap_underlying_blocks(struct inode *inode,
+                                             struct buffer_head *bh)
+{
+        struct block_device *bdev = inode->i_sb->s_bdev;
+        int blocks, i;
+
+        blocks = bh->b_size >> inode->i_blkbits;
+        for (i = 0; i < blocks; i++)
+                unmap_underlying_metadata(bdev, bh->b_blocknr + i);
+}
+
+/*
+ * mpage_da_map_blocks - go through the given space
+ *
+ * @mpd->lbh - bh describing space
+ * @mpd->get_block - the filesystem's block mapper function
+ *
+ * The function skips space we know is already mapped to disk blocks.
+ *
+ * The function ignores errors ->get_block() returns, so the real
+ * error handling is postponed to __mpage_writepage()
+ */
+static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+{
+        struct buffer_head *lbh = &mpd->lbh;
+        int err = 0, remain = lbh->b_size;
+        sector_t next = lbh->b_blocknr;
+        struct buffer_head new;
+
+        /*
+         * We consider only non-mapped and non-allocated blocks
+         */
+        if (buffer_mapped(lbh) && !buffer_delay(lbh))
+                return;
+
+        while (remain) {
+                new.b_state = lbh->b_state;
+                new.b_blocknr = 0;
+                new.b_size = remain;
+                err = mpd->get_block(mpd->inode, next, &new, 1);
+                if (err) {
+                        /*
+                         * Rather than implementing our own error handling
+                         * here, we just leave the remaining blocks
+                         * unallocated and try again with ->writepage()
+                         */
+                        break;
+                }
+                BUG_ON(new.b_size == 0);
+
+                if (buffer_new(&new))
+                        __unmap_underlying_blocks(mpd->inode, &new);
+
+                /*
+                 * If the blocks are marked delayed, we need to put the
+                 * actual block number in and drop the delayed bit
+                 */
+                if (buffer_delay(lbh))
+                        mpage_put_bnr_to_bhs(mpd, next, &new);
+
+                /* go for the remaining blocks */
+                next += new.b_size >> mpd->inode->i_blkbits;
+                remain -= new.b_size;
+        }
+}
+
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+
+/*
+ * mpage_add_bh_to_extent - try to add one more block to the extent of blocks
+ *
+ * @mpd->lbh - extent of blocks
+ * @logical - logical number of the block in the file
+ * @bh - bh of the block (used to access the block's state)
+ *
+ * the function is used to collect contiguous blocks in the same state
+ */
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
+                                   sector_t logical, struct buffer_head *bh)
+{
+        struct buffer_head *lbh = &mpd->lbh;
+        sector_t next;
+
+        next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
+
+        /*
+         * First block in the extent
+         */
+        if (lbh->b_size == 0) {
+                lbh->b_blocknr = logical;
+                lbh->b_size = bh->b_size;
+                lbh->b_state = bh->b_state & BH_FLAGS;
+                return;
+        }
+
+        /*
+         * Can we merge the block into our big extent?
+         */
+        if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
+                lbh->b_size += bh->b_size;
+                return;
+        }
+
+        /*
+         * We couldn't merge the block into our extent, so we
+         * need to flush the current extent and start a new one
+         */
+        mpage_da_map_blocks(mpd);
+
+        /*
+         * Now start a new extent
+         */
+        lbh->b_size = bh->b_size;
+        lbh->b_state = bh->b_state & BH_FLAGS;
+        lbh->b_blocknr = logical;
+}
+
+/*
+ * __mpage_da_writepage - finds extent of pages and blocks
+ *
+ * @page: page to consider
+ * @wbc: not used, we just follow rules
+ * @data: context
+ *
+ * The function finds extents of pages and scans them for all blocks.
+ */
+static int __mpage_da_writepage(struct page *page,
+                                struct writeback_control *wbc, void *data)
+{
+        struct mpage_da_data *mpd = data;
+        struct inode *inode = mpd->inode;
+        struct buffer_head *bh, *head, fake;
+        sector_t logical;
+
+        /*
+         * Can we merge this page into the current extent?
+         */
+        if (mpd->next_page != page->index) {
+                /*
+                 * Nope, we can't. So, we map the non-allocated blocks
+                 * and start IO on them using __mpage_writepage()
+                 */
+                if (mpd->next_page != mpd->first_page) {
+                        mpage_da_map_blocks(mpd);
+                        mpage_da_submit_io(mpd);
+                }
+
+                /*
+                 * Start next extent of pages ...
+                 */
+                mpd->first_page = page->index;
+
+                /*
+                 * ... and blocks
+                 */
+                mpd->lbh.b_size = 0;
+                mpd->lbh.b_state = 0;
+                mpd->lbh.b_blocknr = 0;
+        }
+
+        mpd->next_page = page->index + 1;
+        logical = (sector_t) page->index <<
+                  (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+        if (!page_has_buffers(page)) {
+                /*
+                 * There are no buffer heads attached yet (mmap?), so
+                 * we treat the page as full of dirty blocks
+                 */
+                bh = &fake;
+                bh->b_size = PAGE_CACHE_SIZE;
+                bh->b_state = 0;
+                set_buffer_dirty(bh);
+                set_buffer_uptodate(bh);
+                mpage_add_bh_to_extent(mpd, logical, bh);
+        } else {
+                /*
+                 * Page with regular buffer heads, just add all dirty ones
+                 */
+                head = page_buffers(page);
+                bh = head;
+                do {
+                        BUG_ON(buffer_locked(bh));
+                        if (buffer_dirty(bh))
+                                mpage_add_bh_to_extent(mpd, logical, bh);
+                        logical++;
+                } while ((bh = bh->b_this_page) != head);
+        }
+
+        return 0;
+}
+
+/*
+ * mpage_da_writepages - walk the list of dirty pages of the given
+ * address space, allocate non-allocated blocks, map newly-allocated
+ * blocks to existing bhs and issue IO on them
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function.
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * In order to avoid duplication of logic that deals with partial pages,
+ * multiple bios per page, etc, we find non-allocated blocks, allocate
+ * them with minimal calls to ->get_block() and re-use __mpage_writepage()
+ *
+ * It's important that we call __mpage_writepage() only once for each
+ * involved page, otherwise we'd have to implement more complicated logic
+ * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
+ *
+ * See comments to mpage_writepages()
+ */
+static int mpage_da_writepages(struct address_space *mapping,
+                               struct writeback_control *wbc,
+                               get_block_t get_block)
+{
+        struct mpage_da_data mpd;
+        int ret;
+
+        if (!get_block)
+                return generic_writepages(mapping, wbc);
+
+        mpd.wbc = wbc;
+        mpd.inode = mapping->host;
+        mpd.lbh.b_size = 0;
+        mpd.lbh.b_state = 0;
+        mpd.lbh.b_blocknr = 0;
+        mpd.first_page = 0;
+        mpd.next_page = 0;
+        mpd.get_block = get_block;
+
+        ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+
+        /*
+         * Handle the last extent of pages
+         */
+        if (mpd.next_page != mpd.first_page) {
+                mpage_da_map_blocks(&mpd);
+                mpage_da_submit_io(&mpd);
+        }
+
+        return ret;
+}
+
+/*
+ * this is a special callback for ->write_begin() only
+ * its intention is to return a mapped block or reserve space
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create)
+{
+        int ret = 0;
+
+        BUG_ON(create == 0);
+        BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
+        /*
+         * first, we need to know whether the block is allocated already
+         * preallocated blocks are unmapped but should be treated
+         * the same as allocated blocks.
+         */
+        ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0);
+        if (ret == 0) {
+                /* the block isn't allocated yet, let's reserve space */
+                /* XXX: call reservation here */
+                /*
+                 * XXX: __block_prepare_write() unmaps the passed block,
+                 * is it OK?
+                 */
+                map_bh(bh_result, inode->i_sb, 0);
+                set_buffer_new(bh_result);
+                set_buffer_delay(bh_result);
+        } else if (ret > 0) {
+                bh_result->b_size = (ret << inode->i_blkbits);
+                ret = 0;
+        }
+
+        return ret;
+}
+
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+                                   struct buffer_head *bh_result, int create)
+{
+        int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+        loff_t disksize = EXT4_I(inode)->i_disksize;
+        handle_t *handle = NULL;
+
+        if (create) {
+                handle = ext4_journal_start(inode, needed_blocks);
+                if (IS_ERR(handle)) {
+                        ret = PTR_ERR(handle);
+                        goto out;
+                }
+        }
+
+        ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+                                   bh_result, create, 0);
+        if (ret > 0) {
+                bh_result->b_size = (ret << inode->i_blkbits);
+
+                /*
+                 * Update the on-disk size along with the block allocation;
+                 * we don't use 'extend_disksize' as the size may change
+                 * within an already allocated block -bzzz
+                 */
+                disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+                if (disksize > i_size_read(inode))
+                        disksize = i_size_read(inode);
+                if (disksize > EXT4_I(inode)->i_disksize) {
+                        /*
+                         * XXX: replace with a spinlock if seen contended -bzzz
+                         */
+                        down_write(&EXT4_I(inode)->i_data_sem);
+                        if (disksize > EXT4_I(inode)->i_disksize)
+                                EXT4_I(inode)->i_disksize = disksize;
+                        up_write(&EXT4_I(inode)->i_data_sem);
+
+                        if (EXT4_I(inode)->i_disksize == disksize) {
+                                if (handle == NULL)
+                                        handle = ext4_journal_start(inode, 1);
+                                if (!IS_ERR(handle))
+                                        ext4_mark_inode_dirty(handle, inode);
+                        }
+                }
+
+                ret = 0;
+        }
+
+out:
+        if (handle && !IS_ERR(handle))
+                ext4_journal_stop(handle);
+
+        return ret;
+}
+
+/* FIXME!! only supports data=writeback mode */
+static int ext4_da_writepage(struct page *page,
+                             struct writeback_control *wbc)
+{
+        struct inode *inode = page->mapping->host;
+        handle_t *handle = NULL;
+        int ret = 0;
+        int err;
+
+        if (ext4_journal_current_handle())
+                goto out_fail;
+
+        handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                goto out_fail;
+        }
+
+        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+                ret = nobh_writepage(page, ext4_get_block, wbc);
+        else
+                ret = block_write_full_page(page, ext4_get_block, wbc);
+
+        if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
+                EXT4_I(inode)->i_disksize = inode->i_size;
+                ext4_mark_inode_dirty(handle, inode);
+        }
+
+        err = ext4_journal_stop(handle);
+        if (!ret)
+                ret = err;
+        return ret;
+
+out_fail:
+        redirty_page_for_writepage(wbc, page);
+        unlock_page(page);
+        return ret;
+}
+
+static int ext4_da_writepages(struct address_space *mapping,
+                              struct writeback_control *wbc)
+{
+        return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
+}
+
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+                               loff_t pos, unsigned len, unsigned flags,
+                               struct page **pagep, void **fsdata)
+{
+        int ret;
+        struct page *page;
+        pgoff_t index;
+        unsigned from, to;
+        struct inode *inode = mapping->host;
+        handle_t *handle;
+
+        index = pos >> PAGE_CACHE_SHIFT;
+        from = pos & (PAGE_CACHE_SIZE - 1);
+        to = from + len;
+
+        /*
+         * With delayed allocation, we don't log the i_disksize update
+         * if there is delayed block allocation. But we still need
+         * to journal the i_disksize update if the write extends the
+         * end of the file into an already mapped buffer.
+         */
+        handle = ext4_journal_start(inode, 1);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                goto out;
+        }
+
+        page = __grab_cache_page(mapping, index);
+        if (!page)
+                return -ENOMEM;
+        *pagep = page;
+
+        ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+                                ext4_da_get_block_prep);
+        if (ret < 0) {
+                unlock_page(page);
+                ext4_journal_stop(handle);
+                page_cache_release(page);
+        }
+
+out:
+        return ret;
+}
+
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+        return !buffer_mapped(bh) || buffer_delay(bh);
+}
+
+static int ext4_da_write_end(struct file *file,
+                             struct address_space *mapping,
+                             loff_t pos, unsigned len, unsigned copied,
+                             struct page *page, void *fsdata)
+{
+        struct inode *inode = mapping->host;
+        int ret = 0, ret2;
+        handle_t *handle = ext4_journal_current_handle();
+        loff_t new_i_size;
+
+        /*
+         * generic_write_end() will run mark_inode_dirty() if i_size
+         * changes. So let's piggyback the i_disksize mark_inode_dirty
+         * into that.
+         */
+
+        new_i_size = pos + copied;
+        if (new_i_size > EXT4_I(inode)->i_disksize)
+                if (!walk_page_buffers(NULL, page_buffers(page),
+                                       0, len, NULL, ext4_bh_unmapped_or_delay)) {
+                        /*
+                         * Update i_disksize when extending the file
+                         * without needing block allocation
+                         */
+                        if (ext4_should_order_data(inode))
+                                ret = ext4_jbd2_file_inode(handle, inode);
+
+                        EXT4_I(inode)->i_disksize = new_i_size;
+                }
+        ret2 = generic_write_end(file, mapping, pos, len, copied,
+                                 page, fsdata);
+        copied = ret2;
+        if (ret2 < 0)
+                ret = ret2;
+        ret2 = ext4_journal_stop(handle);
+        if (!ret)
+                ret = ret2;
+
+        return ret ? ret : copied;
+}
+
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+        struct buffer_head *head, *bh;
+        unsigned int curr_off = 0;
+
+        /*
+         * Drop reserved blocks
+         */
+        BUG_ON(!PageLocked(page));
+        if (!page_has_buffers(page))
+                goto out;
+
+        head = page_buffers(page);
+        bh = head;
+        do {
+                unsigned int next_off = curr_off + bh->b_size;
+
+                /*
+                 * is this block fully invalidated?
+                 */
+                if (offset <= curr_off && buffer_delay(bh)) {
+                        clear_buffer_delay(bh);
+                        /* XXX: add real stuff here */
+                }
+                curr_off = next_off;
+                bh = bh->b_this_page;
+        } while (bh != head);
+
+out:
+        ext4_invalidatepage(page, offset);
+
+        return;
+}
+
+
+/*
  * bmap() is special. It gets used by applications such as lilo and by
  * the swapper to find the on-disk block of a specific piece of data.
  *
@@ -1427,6 +2093,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 	journal_t *journal;
 	int err;
 
+	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+	    test_opt(inode->i_sb, DELALLOC)) {
+		/*
+		 * With delalloc we want to sync the file
+		 * so that we can make sure we allocate
+		 * blocks for the file
+		 */
+		filemap_write_and_wait(mapping);
+	}
+
 	if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
 		/*
 		 * This is a REALLY heavyweight approach, but the use of
@@ -1471,11 +2147,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
-{
-	return !buffer_mapped(bh) || buffer_delay(bh);
-}
-
 /*
  * Note that we don't need to start a transaction unless we're journaling data
  * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -1832,10 +2503,28 @@ static const struct address_space_operations ext4_journalled_aops = {
 	.releasepage		= ext4_releasepage,
 };
 
+static const struct address_space_operations ext4_da_aops = {
+	.readpage		= ext4_readpage,
+	.readpages		= ext4_readpages,
+	.writepage		= ext4_da_writepage,
+	.writepages		= ext4_da_writepages,
+	.sync_page		= block_sync_page,
+	.write_begin		= ext4_da_write_begin,
+	.write_end		= ext4_da_write_end,
+	.bmap			= ext4_bmap,
+	.invalidatepage		= ext4_da_invalidatepage,
+	.releasepage		= ext4_releasepage,
+	.direct_IO		= ext4_direct_IO,
+	.migratepage		= buffer_migrate_page,
+};
+
 void ext4_set_aops(struct inode *inode)
 {
 	if (ext4_should_order_data(inode))
 		inode->i_mapping->a_ops = &ext4_ordered_aops;
+	else if (ext4_should_writeback_data(inode) &&
+		 test_opt(inode->i_sb, DELALLOC))
+		inode->i_mapping->a_ops = &ext4_da_aops;
 	else if (ext4_should_writeback_data(inode))
 		inode->i_mapping->a_ops = &ext4_writeback_aops;
 	else
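ext4_da_get_block_write() above grows i_disksize with an unlocked test
followed by a retest under i_data_sem, so the common no-growth case pays no
locking cost. Below is a rough userspace model of that check/lock/recheck
pattern, assuming a pthread mutex in place of the kernel's rw_semaphore;
the names are illustrative, not kernel interfaces.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t data_sem = PTHREAD_MUTEX_INITIALIZER;
static long long i_disksize;

static void maybe_grow_disksize(long long newsize)
{
        if (newsize <= i_disksize)              /* unlocked fast path */
                return;
        pthread_mutex_lock(&data_sem);
        if (newsize > i_disksize)               /* recheck under the lock */
                i_disksize = newsize;
        pthread_mutex_unlock(&data_sem);
}

int main(void)
{
        maybe_grow_disksize(4096);
        maybe_grow_disksize(1024);              /* no-op: it never shrinks */
        printf("i_disksize = %lld\n", i_disksize);
        return 0;
}

Because concurrent callers can only ever grow the value, a stale read on the
fast path is harmless: at worst a caller takes the lock and finds nothing to
do.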
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 629d0fa27e3a..de9d3d0eb203 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -898,7 +898,7 @@ enum {
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
-	Opt_mballoc, Opt_nomballoc, Opt_stripe,
+	Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc,
 };
 
 static match_table_t tokens = {
@@ -957,6 +957,7 @@ static match_table_t tokens = {
 	{Opt_nomballoc, "nomballoc"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_resize, "resize"},
+	{Opt_delalloc, "delalloc"},
 	{Opt_err, NULL},
 };
 
@@ -1335,6 +1336,9 @@ set_qf_format:
 			return 0;
 		sbi->s_stripe = option;
 		break;
+	case Opt_delalloc:
+		set_opt(sbi->s_mount_opt, DELALLOC);
+		break;
 	default:
 		printk (KERN_ERR
 			"EXT4-fs: Unrecognized mount option \"%s\" "