author    Jan Kara <jack@suse.cz>        2013-06-04 13:17:40 -0400
committer Theodore Ts'o <tytso@mit.edu>  2013-06-04 13:17:40 -0400
commit    4e7ea81db53465ddd753678bc4cebf95369d0984 (patch)
tree      39d71bab48ba0406c17410aa37d291485cd00b17 /fs/ext4
parent    fffb273997cc52f255bde5f18e7f6b4686c914fb (diff)
ext4: restructure writeback path
There are two issues with the current writeback path in ext4. For one, when blocksize < pagesize we do not necessarily map complete pages, and thus may not do any writeback in a given iteration. We always map some blocks, though, so we will eventually finish mapping the page; but if writeback races with other operations on the file, forward progress is not really guaranteed. The second problem is that the current code structure makes it hard to associate all the bios for some range of pages with one io_end structure so that unwritten extents can be converted after all the bios are finished. This will be especially difficult later, when io_end will be associated with a reserved transaction handle.

We restructure the writeback path into a relatively simple loop which first prepares an extent of pages, then maps one or more extents so that no page is left partially mapped, and once a page is fully mapped submits it for IO. We keep all the mapping and IO submission information in the mpage_da_data structure to somewhat reduce stack usage. The resulting code is somewhat shorter than the old one and hopefully also easier to read.

Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
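For orientation, the loop this patch introduces can be sketched in a few lines of self-contained C. The names here (wb_state, prepare_extent, map_and_submit_extent, release_unused_pages) are hypothetical stand-ins for mpage_da_data, mpage_prepare_extent_to_map(), mpage_map_and_submit_extent() and mpage_release_unused_pages() in the diff below; journalling, io_end handling and all error detail are deliberately omitted.

#include <stdbool.h>

/* Hypothetical stand-in for struct mpage_da_data. */
struct wb_state {
	long first_page, last_page, next_page;
	long map_len;            /* blocks still waiting to be mapped */
};

/* Trivial stubs standing in for the mpage_* helpers in the patch. */
static int prepare_extent(struct wb_state *s)
{                            /* lock dirty pages, find extent to map */
	s->next_page = s->last_page + 1;
	s->map_len = 0;
	return 0;
}

static int map_and_submit_extent(struct wb_state *s)
{                            /* allocate blocks, submit fully mapped pages */
	s->map_len = 0;
	return 0;
}

static void release_unused_pages(struct wb_state *s)
{                            /* unlock pages prepared but not written */
	(void)s;
}

static int writepages_sketch(struct wb_state *s)
{
	bool done = false;
	int err = 0;

	while (!done && s->first_page <= s->last_page) {
		err = prepare_extent(s);
		if (!err) {
			if (s->map_len)
				err = map_and_submit_extent(s);
			else
				done = true;   /* nothing left needing mapping */
		}
		release_unused_pages(s);
		if (err)
			break;
		s->first_page = s->next_page;
	}
	return err;
}

int main(void)
{
	struct wb_state s = { .first_page = 0, .last_page = 7, .next_page = 0, .map_len = 0 };
	return writepages_sketch(&s);
}

In the real code each iteration of this loop additionally starts a transaction handle sized by ext4_da_writepages_trans_blocks() and allocates a fresh io_end, which is why the helpers above are only stand-ins for the functions added in fs/ext4/inode.c below.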
Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/ext4.h       15
-rw-r--r--  fs/ext4/inode.c    1011
-rw-r--r--  fs/ext4/page-io.c     4
3 files changed, 487 insertions, 543 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2ebfcde5a156..90a164f365c4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,21 +177,6 @@ struct ext4_map_blocks {
177}; 177};
178 178
179/* 179/*
180 * For delayed allocation tracking
181 */
182struct mpage_da_data {
183 struct inode *inode;
184 sector_t b_blocknr; /* start block number of extent */
185 size_t b_size; /* size of extent */
186 unsigned long b_state; /* state of the extent */
187 unsigned long first_page, next_page; /* extent of pages */
188 struct writeback_control *wbc;
189 int io_done;
190 int pages_written;
191 int retval;
192};
193
194/*
195 * Flags for ext4_io_end->flags 180 * Flags for ext4_io_end->flags
196 */ 181 */
197#define EXT4_IO_END_UNWRITTEN 0x0001 182#define EXT4_IO_END_UNWRITTEN 0x0001
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2b777e51b677..5939a4742def 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1407,149 +1407,42 @@ static void ext4_da_page_release_reservation(struct page *page,
1407 * Delayed allocation stuff 1407 * Delayed allocation stuff
1408 */ 1408 */
1409 1409
1410static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd); 1410struct mpage_da_data {
1411 1411 struct inode *inode;
1412/* 1412 struct writeback_control *wbc;
1413 * mpage_da_submit_io - walks through extent of pages and try to write 1413 pgoff_t first_page; /* The first page to write */
1414 * them with writepage() call back 1414 pgoff_t next_page; /* Current page to examine */
1415 * 1415 pgoff_t last_page; /* Last page to examine */
1416 * @mpd->inode: inode
1417 * @mpd->first_page: first page of the extent
1418 * @mpd->next_page: page after the last page of the extent
1419 *
1420 * By the time mpage_da_submit_io() is called we expect all blocks
1421 * to be allocated. this may be wrong if allocation failed.
1422 *
1423 * As pages are already locked by write_cache_pages(), we can't use it
1424 */
1425static int mpage_da_submit_io(struct mpage_da_data *mpd,
1426 struct ext4_map_blocks *map)
1427{
1428 struct pagevec pvec;
1429 unsigned long index, end;
1430 int ret = 0, err, nr_pages, i;
1431 struct inode *inode = mpd->inode;
1432 struct address_space *mapping = inode->i_mapping;
1433 loff_t size = i_size_read(inode);
1434 unsigned int len, block_start;
1435 struct buffer_head *bh, *page_bufs = NULL;
1436 sector_t pblock = 0, cur_logical = 0;
1437 struct ext4_io_submit io_submit;
1438
1439 BUG_ON(mpd->next_page <= mpd->first_page);
1440 ext4_io_submit_init(&io_submit, mpd->wbc);
1441 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
1442 if (!io_submit.io_end) {
1443 ext4_da_block_invalidatepages(mpd);
1444 return -ENOMEM;
1445 }
1446 /* 1416 /*
1447 * We need to start from the first_page to the next_page - 1 1417 * Extent to map - this can be after first_page because that can be
1448 * to make sure we also write the mapped dirty buffer_heads. 1418 * fully mapped. We somewhat abuse m_flags to store whether the extent
1449 * If we look at mpd->b_blocknr we would only be looking 1419 * is delalloc or unwritten.
1450 * at the currently mapped buffer_heads.
1451 */ 1420 */
1452 index = mpd->first_page; 1421 struct ext4_map_blocks map;
1453 end = mpd->next_page - 1; 1422 struct ext4_io_submit io_submit; /* IO submission data */
1454 1423};
1455 pagevec_init(&pvec, 0);
1456 while (index <= end) {
1457 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1458 if (nr_pages == 0)
1459 break;
1460 for (i = 0; i < nr_pages; i++) {
1461 int skip_page = 0;
1462 struct page *page = pvec.pages[i];
1463
1464 index = page->index;
1465 if (index > end)
1466 break;
1467
1468 if (index == size >> PAGE_CACHE_SHIFT)
1469 len = size & ~PAGE_CACHE_MASK;
1470 else
1471 len = PAGE_CACHE_SIZE;
1472 if (map) {
1473 cur_logical = index << (PAGE_CACHE_SHIFT -
1474 inode->i_blkbits);
1475 pblock = map->m_pblk + (cur_logical -
1476 map->m_lblk);
1477 }
1478 index++;
1479
1480 BUG_ON(!PageLocked(page));
1481 BUG_ON(PageWriteback(page));
1482
1483 bh = page_bufs = page_buffers(page);
1484 block_start = 0;
1485 do {
1486 if (map && (cur_logical >= map->m_lblk) &&
1487 (cur_logical <= (map->m_lblk +
1488 (map->m_len - 1)))) {
1489 if (buffer_delay(bh)) {
1490 clear_buffer_delay(bh);
1491 bh->b_blocknr = pblock;
1492 }
1493 if (buffer_unwritten(bh) ||
1494 buffer_mapped(bh))
1495 BUG_ON(bh->b_blocknr != pblock);
1496 if (map->m_flags & EXT4_MAP_UNINIT)
1497 set_buffer_uninit(bh);
1498 clear_buffer_unwritten(bh);
1499 }
1500
1501 /*
1502 * skip page if block allocation undone and
1503 * block is dirty
1504 */
1505 if (ext4_bh_delay_or_unwritten(NULL, bh))
1506 skip_page = 1;
1507 bh = bh->b_this_page;
1508 block_start += bh->b_size;
1509 cur_logical++;
1510 pblock++;
1511 } while (bh != page_bufs);
1512
1513 if (skip_page) {
1514 unlock_page(page);
1515 continue;
1516 }
1517
1518 clear_page_dirty_for_io(page);
1519 err = ext4_bio_write_page(&io_submit, page, len,
1520 mpd->wbc);
1521 if (!err)
1522 mpd->pages_written++;
1523 /*
1524 * In error case, we have to continue because
1525 * remaining pages are still locked
1526 */
1527 if (ret == 0)
1528 ret = err;
1529 }
1530 pagevec_release(&pvec);
1531 }
1532 ext4_io_submit(&io_submit);
1533 /* Drop io_end reference we got from init */
1534 ext4_put_io_end_defer(io_submit.io_end);
1535 return ret;
1536}
1537 1424
1538static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) 1425static void mpage_release_unused_pages(struct mpage_da_data *mpd,
1426 bool invalidate)
1539{ 1427{
1540 int nr_pages, i; 1428 int nr_pages, i;
1541 pgoff_t index, end; 1429 pgoff_t index, end;
1542 struct pagevec pvec; 1430 struct pagevec pvec;
1543 struct inode *inode = mpd->inode; 1431 struct inode *inode = mpd->inode;
1544 struct address_space *mapping = inode->i_mapping; 1432 struct address_space *mapping = inode->i_mapping;
1545 ext4_lblk_t start, last; 1433
1434 /* This is necessary when next_page == 0. */
1435 if (mpd->first_page >= mpd->next_page)
1436 return;
1546 1437
1547 index = mpd->first_page; 1438 index = mpd->first_page;
1548 end = mpd->next_page - 1; 1439 end = mpd->next_page - 1;
1549 1440 if (invalidate) {
1550 start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1441 ext4_lblk_t start, last;
1551 last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1442 start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1552 ext4_es_remove_extent(inode, start, last - start + 1); 1443 last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1444 ext4_es_remove_extent(inode, start, last - start + 1);
1445 }
1553 1446
1554 pagevec_init(&pvec, 0); 1447 pagevec_init(&pvec, 0);
1555 while (index <= end) { 1448 while (index <= end) {
@@ -1562,14 +1455,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
1562 break; 1455 break;
1563 BUG_ON(!PageLocked(page)); 1456 BUG_ON(!PageLocked(page));
1564 BUG_ON(PageWriteback(page)); 1457 BUG_ON(PageWriteback(page));
1565 block_invalidatepage(page, 0, PAGE_CACHE_SIZE); 1458 if (invalidate) {
1566 ClearPageUptodate(page); 1459 block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
1460 ClearPageUptodate(page);
1461 }
1567 unlock_page(page); 1462 unlock_page(page);
1568 } 1463 }
1569 index = pvec.pages[nr_pages - 1]->index + 1; 1464 index = pvec.pages[nr_pages - 1]->index + 1;
1570 pagevec_release(&pvec); 1465 pagevec_release(&pvec);
1571 } 1466 }
1572 return;
1573} 1467}
1574 1468
1575static void ext4_print_free_blocks(struct inode *inode) 1469static void ext4_print_free_blocks(struct inode *inode)
@@ -1598,215 +1492,6 @@ static void ext4_print_free_blocks(struct inode *inode)
1598 return; 1492 return;
1599} 1493}
1600 1494
1601/*
1602 * mpage_da_map_and_submit - go through given space, map them
1603 * if necessary, and then submit them for I/O
1604 *
1605 * @mpd - bh describing space
1606 *
1607 * The function skips space we know is already mapped to disk blocks.
1608 *
1609 */
1610static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1611{
1612 int err, blks, get_blocks_flags;
1613 struct ext4_map_blocks map, *mapp = NULL;
1614 sector_t next = mpd->b_blocknr;
1615 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
1616 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
1617 handle_t *handle = NULL;
1618
1619 /*
1620 * If the blocks are mapped already, or we couldn't accumulate
1621 * any blocks, then proceed immediately to the submission stage.
1622 */
1623 if ((mpd->b_size == 0) ||
1624 ((mpd->b_state & (1 << BH_Mapped)) &&
1625 !(mpd->b_state & (1 << BH_Delay)) &&
1626 !(mpd->b_state & (1 << BH_Unwritten))))
1627 goto submit_io;
1628
1629 handle = ext4_journal_current_handle();
1630 BUG_ON(!handle);
1631
1632 /*
1633 * Call ext4_map_blocks() to allocate any delayed allocation
1634 * blocks, or to convert an uninitialized extent to be
1635 * initialized (in the case where we have written into
1636 * one or more preallocated blocks).
1637 *
1638 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
1639 * indicate that we are on the delayed allocation path. This
1640 * affects functions in many different parts of the allocation
1641 * call path. This flag exists primarily because we don't
1642 * want to change *many* call functions, so ext4_map_blocks()
1643 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
1644 * inode's allocation semaphore is taken.
1645 *
1646 * If the blocks in questions were delalloc blocks, set
1647 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
1648 * variables are updated after the blocks have been allocated.
1649 */
1650 map.m_lblk = next;
1651 map.m_len = max_blocks;
1652 /*
1653 * We're in delalloc path and it is possible that we're going to
1654 * need more metadata blocks than previously reserved. However
1655 * we must not fail because we're in writeback and there is
1656 * nothing we can do about it so it might result in data loss.
1657 * So use reserved blocks to allocate metadata if possible.
1658 */
1659 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
1660 EXT4_GET_BLOCKS_METADATA_NOFAIL;
1661 if (ext4_should_dioread_nolock(mpd->inode))
1662 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
1663 if (mpd->b_state & (1 << BH_Delay))
1664 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
1665
1666
1667 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
1668 if (blks < 0) {
1669 struct super_block *sb = mpd->inode->i_sb;
1670
1671 err = blks;
1672 /*
1673 * If get block returns EAGAIN or ENOSPC and there
1674 * appears to be free blocks we will just let
1675 * mpage_da_submit_io() unlock all of the pages.
1676 */
1677 if (err == -EAGAIN)
1678 goto submit_io;
1679
1680 if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
1681 mpd->retval = err;
1682 goto submit_io;
1683 }
1684
1685 /*
1686 * get block failure will cause us to loop in
1687 * writepages, because a_ops->writepage won't be able
1688 * to make progress. The page will be redirtied by
1689 * writepage and writepages will again try to write
1690 * the same.
1691 */
1692 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
1693 ext4_msg(sb, KERN_CRIT,
1694 "delayed block allocation failed for inode %lu "
1695 "at logical offset %llu with max blocks %zd "
1696 "with error %d", mpd->inode->i_ino,
1697 (unsigned long long) next,
1698 mpd->b_size >> mpd->inode->i_blkbits, err);
1699 ext4_msg(sb, KERN_CRIT,
1700 "This should not happen!! Data will be lost");
1701 if (err == -ENOSPC)
1702 ext4_print_free_blocks(mpd->inode);
1703 }
1704 /* invalidate all the pages */
1705 ext4_da_block_invalidatepages(mpd);
1706
1707 /* Mark this page range as having been completed */
1708 mpd->io_done = 1;
1709 return;
1710 }
1711 BUG_ON(blks == 0);
1712
1713 mapp = &map;
1714 if (map.m_flags & EXT4_MAP_NEW) {
1715 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
1716 int i;
1717
1718 for (i = 0; i < map.m_len; i++)
1719 unmap_underlying_metadata(bdev, map.m_pblk + i);
1720 }
1721
1722 /*
1723 * Update on-disk size along with block allocation.
1724 */
1725 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
1726 if (disksize > i_size_read(mpd->inode))
1727 disksize = i_size_read(mpd->inode);
1728 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
1729 ext4_update_i_disksize(mpd->inode, disksize);
1730 err = ext4_mark_inode_dirty(handle, mpd->inode);
1731 if (err)
1732 ext4_error(mpd->inode->i_sb,
1733 "Failed to mark inode %lu dirty",
1734 mpd->inode->i_ino);
1735 }
1736
1737submit_io:
1738 mpage_da_submit_io(mpd, mapp);
1739 mpd->io_done = 1;
1740}
1741
1742#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
1743 (1 << BH_Delay) | (1 << BH_Unwritten))
1744
1745/*
1746 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
1747 *
1748 * @mpd->lbh - extent of blocks
1749 * @logical - logical number of the block in the file
1750 * @b_state - b_state of the buffer head added
1751 *
1752 * the function is used to collect contig. blocks in same state
1753 */
1754static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
1755 unsigned long b_state)
1756{
1757 sector_t next;
1758 int blkbits = mpd->inode->i_blkbits;
1759 int nrblocks = mpd->b_size >> blkbits;
1760
1761 /*
1762 * XXX Don't go larger than mballoc is willing to allocate
1763 * This is a stopgap solution. We eventually need to fold
1764 * mpage_da_submit_io() into this function and then call
1765 * ext4_map_blocks() multiple times in a loop
1766 */
1767 if (nrblocks >= (8*1024*1024 >> blkbits))
1768 goto flush_it;
1769
1770 /* check if the reserved journal credits might overflow */
1771 if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
1772 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
1773 /*
1774 * With non-extent format we are limited by the journal
1775 * credit available. Total credit needed to insert
1776 * nrblocks contiguous blocks is dependent on the
1777 * nrblocks. So limit nrblocks.
1778 */
1779 goto flush_it;
1780 }
1781 }
1782 /*
1783 * First block in the extent
1784 */
1785 if (mpd->b_size == 0) {
1786 mpd->b_blocknr = logical;
1787 mpd->b_size = 1 << blkbits;
1788 mpd->b_state = b_state & BH_FLAGS;
1789 return;
1790 }
1791
1792 next = mpd->b_blocknr + nrblocks;
1793 /*
1794 * Can we merge the block to our big extent?
1795 */
1796 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
1797 mpd->b_size += 1 << blkbits;
1798 return;
1799 }
1800
1801flush_it:
1802 /*
1803 * We couldn't merge the block to our extent, so we
1804 * need to flush current extent and start new one
1805 */
1806 mpage_da_map_and_submit(mpd);
1807 return;
1808}
1809
1810static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) 1495static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
1811{ 1496{
1812 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); 1497 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
@@ -2204,6 +1889,8 @@ static int ext4_writepage(struct page *page,
2204 return ret; 1889 return ret;
2205} 1890}
2206 1891
1892#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
1893
2207/* 1894/*
2208 * mballoc gives us at most this number of blocks... 1895 * mballoc gives us at most this number of blocks...
2209 * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). 1896 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
@@ -2212,6 +1899,315 @@ static int ext4_writepage(struct page *page,
2212#define MAX_WRITEPAGES_EXTENT_LEN 2048 1899#define MAX_WRITEPAGES_EXTENT_LEN 2048
2213 1900
2214/* 1901/*
1902 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
1903 *
1904 * @mpd - extent of blocks
1905 * @lblk - logical number of the block in the file
1906 * @b_state - b_state of the buffer head added
1907 *
1908 * the function is used to collect contig. blocks in same state
1909 */
1910static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
1911 unsigned long b_state)
1912{
1913 struct ext4_map_blocks *map = &mpd->map;
1914
1915 /* Don't go larger than mballoc is willing to allocate */
1916 if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
1917 return 0;
1918
1919 /* First block in the extent? */
1920 if (map->m_len == 0) {
1921 map->m_lblk = lblk;
1922 map->m_len = 1;
1923 map->m_flags = b_state & BH_FLAGS;
1924 return 1;
1925 }
1926
1927 /* Can we merge the block to our big extent? */
1928 if (lblk == map->m_lblk + map->m_len &&
1929 (b_state & BH_FLAGS) == map->m_flags) {
1930 map->m_len++;
1931 return 1;
1932 }
1933 return 0;
1934}
1935
1936static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
1937 struct buffer_head *head,
1938 struct buffer_head *bh,
1939 ext4_lblk_t lblk)
1940{
1941 struct inode *inode = mpd->inode;
1942 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
1943 >> inode->i_blkbits;
1944
1945 do {
1946 BUG_ON(buffer_locked(bh));
1947
1948 if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
1949 (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
1950 lblk >= blocks) {
1951 /* Found extent to map? */
1952 if (mpd->map.m_len)
1953 return false;
1954 if (lblk >= blocks)
1955 return true;
1956 continue;
1957 }
1958 if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
1959 return false;
1960 } while (lblk++, (bh = bh->b_this_page) != head);
1961 return true;
1962}
1963
1964static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
1965{
1966 int len;
1967 loff_t size = i_size_read(mpd->inode);
1968 int err;
1969
1970 BUG_ON(page->index != mpd->first_page);
1971 if (page->index == size >> PAGE_CACHE_SHIFT)
1972 len = size & ~PAGE_CACHE_MASK;
1973 else
1974 len = PAGE_CACHE_SIZE;
1975 clear_page_dirty_for_io(page);
1976 err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
1977 if (!err)
1978 mpd->wbc->nr_to_write--;
1979 mpd->first_page++;
1980
1981 return err;
1982}
1983
1984/*
1985 * mpage_map_buffers - update buffers corresponding to changed extent and
1986 * submit fully mapped pages for IO
1987 *
1988 * @mpd - description of extent to map, on return next extent to map
1989 *
1990 * Scan buffers corresponding to changed extent (we expect corresponding pages
1991 * to be already locked) and update buffer state according to new extent state.
1992 * We map delalloc buffers to their physical location, clear unwritten bits,
1993 * and mark buffers as uninit when we perform writes to uninitialized extents
1994 * and do extent conversion after IO is finished. If the last page is not fully
1995 * mapped, we update @map to the next extent in the last page that needs
1996 * mapping. Otherwise we submit the page for IO.
1997 */
1998static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
1999{
2000 struct pagevec pvec;
2001 int nr_pages, i;
2002 struct inode *inode = mpd->inode;
2003 struct buffer_head *head, *bh;
2004 int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
2005 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
2006 >> inode->i_blkbits;
2007 pgoff_t start, end;
2008 ext4_lblk_t lblk;
2009 sector_t pblock;
2010 int err;
2011
2012 start = mpd->map.m_lblk >> bpp_bits;
2013 end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
2014 lblk = start << bpp_bits;
2015 pblock = mpd->map.m_pblk;
2016
2017 pagevec_init(&pvec, 0);
2018 while (start <= end) {
2019 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
2020 PAGEVEC_SIZE);
2021 if (nr_pages == 0)
2022 break;
2023 for (i = 0; i < nr_pages; i++) {
2024 struct page *page = pvec.pages[i];
2025
2026 if (page->index > end)
2027 break;
2028 /* Upto 'end' pages must be contiguous */
2029 BUG_ON(page->index != start);
2030 bh = head = page_buffers(page);
2031 do {
2032 if (lblk < mpd->map.m_lblk)
2033 continue;
2034 if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
2035 /*
2036 * Buffer after end of mapped extent.
2037 * Find next buffer in the page to map.
2038 */
2039 mpd->map.m_len = 0;
2040 mpd->map.m_flags = 0;
2041 add_page_bufs_to_extent(mpd, head, bh,
2042 lblk);
2043 pagevec_release(&pvec);
2044 return 0;
2045 }
2046 if (buffer_delay(bh)) {
2047 clear_buffer_delay(bh);
2048 bh->b_blocknr = pblock++;
2049 }
2050 if (mpd->map.m_flags & EXT4_MAP_UNINIT)
2051 set_buffer_uninit(bh);
2052 clear_buffer_unwritten(bh);
2053 } while (++lblk < blocks &&
2054 (bh = bh->b_this_page) != head);
2055
2056 /*
2057 * FIXME: This is going to break if dioread_nolock
2058 * supports blocksize < pagesize as we will try to
2059 * convert potentially unmapped parts of inode.
2060 */
2061 mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
2062 /* Page fully mapped - let IO run! */
2063 err = mpage_submit_page(mpd, page);
2064 if (err < 0) {
2065 pagevec_release(&pvec);
2066 return err;
2067 }
2068 start++;
2069 }
2070 pagevec_release(&pvec);
2071 }
2072 /* Extent fully mapped and matches with page boundary. We are done. */
2073 mpd->map.m_len = 0;
2074 mpd->map.m_flags = 0;
2075 return 0;
2076}
2077
2078static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
2079{
2080 struct inode *inode = mpd->inode;
2081 struct ext4_map_blocks *map = &mpd->map;
2082 int get_blocks_flags;
2083 int err;
2084
2085 trace_ext4_da_write_pages_extent(inode, map);
2086 /*
2087 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
2088 * to convert an uninitialized extent to be initialized (in the case
2089 * where we have written into one or more preallocated blocks). It is
2090 * possible that we're going to need more metadata blocks than
2091 * previously reserved. However we must not fail because we're in
2092 * writeback and there is nothing we can do about it so it might result
2093 * in data loss. So use reserved blocks to allocate metadata if
2094 * possible.
2095 *
2096 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
2097 * in question are delalloc blocks. This affects functions in many
2098 * different parts of the allocation call path. This flag exists
2099 * primarily because we don't want to change *many* call functions, so
2100 * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
2101 * once the inode's allocation semaphore is taken.
2102 */
2103 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
2104 EXT4_GET_BLOCKS_METADATA_NOFAIL;
2105 if (ext4_should_dioread_nolock(inode))
2106 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2107 if (map->m_flags & (1 << BH_Delay))
2108 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2109
2110 err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
2111 if (err < 0)
2112 return err;
2113
2114 BUG_ON(map->m_len == 0);
2115 if (map->m_flags & EXT4_MAP_NEW) {
2116 struct block_device *bdev = inode->i_sb->s_bdev;
2117 int i;
2118
2119 for (i = 0; i < map->m_len; i++)
2120 unmap_underlying_metadata(bdev, map->m_pblk + i);
2121 }
2122 return 0;
2123}
2124
2125/*
2126 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
2127 * mpd->len and submit pages underlying it for IO
2128 *
2129 * @handle - handle for journal operations
2130 * @mpd - extent to map
2131 *
2132 * The function maps extent starting at mpd->lblk of length mpd->len. If it is
2133 * delayed, blocks are allocated, if it is unwritten, we may need to convert
2134 * them to initialized or split the described range from larger unwritten
2135 * extent. Note that we need not map all the described range since allocation
2136 * can return less blocks or the range is covered by more unwritten extents. We
2137 * cannot map more because we are limited by reserved transaction credits. On
2138 * the other hand we always make sure that the last touched page is fully
2139 * mapped so that it can be written out (and thus forward progress is
2140 * guaranteed). After mapping we submit all mapped pages for IO.
2141 */
2142static int mpage_map_and_submit_extent(handle_t *handle,
2143 struct mpage_da_data *mpd)
2144{
2145 struct inode *inode = mpd->inode;
2146 struct ext4_map_blocks *map = &mpd->map;
2147 int err;
2148 loff_t disksize;
2149
2150 mpd->io_submit.io_end->offset =
2151 ((loff_t)map->m_lblk) << inode->i_blkbits;
2152 while (map->m_len) {
2153 err = mpage_map_one_extent(handle, mpd);
2154 if (err < 0) {
2155 struct super_block *sb = inode->i_sb;
2156
2157 /*
2158 * Need to commit transaction to free blocks. Let upper
2159 * layers sort it out.
2160 */
2161 if (err == -ENOSPC && ext4_count_free_clusters(sb))
2162 return -ENOSPC;
2163
2164 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
2165 ext4_msg(sb, KERN_CRIT,
2166 "Delayed block allocation failed for "
2167 "inode %lu at logical offset %llu with"
2168 " max blocks %u with error %d",
2169 inode->i_ino,
2170 (unsigned long long)map->m_lblk,
2171 (unsigned)map->m_len, err);
2172 ext4_msg(sb, KERN_CRIT,
2173 "This should not happen!! Data will "
2174 "be lost\n");
2175 if (err == -ENOSPC)
2176 ext4_print_free_blocks(inode);
2177 }
2178 /* invalidate all the pages */
2179 mpage_release_unused_pages(mpd, true);
2180 return err;
2181 }
2182 /*
2183 * Update buffer state, submit mapped pages, and get us new
2184 * extent to map
2185 */
2186 err = mpage_map_and_submit_buffers(mpd);
2187 if (err < 0)
2188 return err;
2189 }
2190
2191 /* Update on-disk size after IO is submitted */
2192 disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
2193 if (disksize > i_size_read(inode))
2194 disksize = i_size_read(inode);
2195 if (disksize > EXT4_I(inode)->i_disksize) {
2196 int err2;
2197
2198 ext4_update_i_disksize(inode, disksize);
2199 err2 = ext4_mark_inode_dirty(handle, inode);
2200 if (err2)
2201 ext4_error(inode->i_sb,
2202 "Failed to mark inode %lu dirty",
2203 inode->i_ino);
2204 if (!err)
2205 err = err2;
2206 }
2207 return err;
2208}
2209
2210/*
2215 * Calculate the total number of credits to reserve for one writepages 2211 * Calculate the total number of credits to reserve for one writepages
2216 * iteration. This is called from ext4_da_writepages(). We map an extent of 2212 * iteration. This is called from ext4_da_writepages(). We map an extent of
2217 * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping 2213 * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
@@ -2227,44 +2223,49 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2227} 2223}
2228 2224
2229/* 2225/*
2230 * write_cache_pages_da - walk the list of dirty pages of the given 2226 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
2231 * address space and accumulate pages that need writing, and call 2227 * and underlying extent to map
2232 * mpage_da_map_and_submit to map a single contiguous memory region 2228 *
2233 * and then write them. 2229 * @mpd - where to look for pages
2230 *
2231 * Walk dirty pages in the mapping. If they are fully mapped, submit them for
2232 * IO immediately. When we find a page which isn't mapped we start accumulating
2233 * extent of buffers underlying these pages that needs mapping (formed by
2234 * either delayed or unwritten buffers). We also lock the pages containing
2235 * these buffers. The extent found is returned in @mpd structure (starting at
2236 * mpd->lblk with length mpd->len blocks).
2237 *
2238 * Note that this function can attach bios to one io_end structure which are
2239 * neither logically nor physically contiguous. Although it may seem as an
2240 * unnecessary complication, it is actually inevitable in blocksize < pagesize
2241 * case as we need to track IO to all buffers underlying a page in one io_end.
2234 */ 2242 */
2235static int write_cache_pages_da(handle_t *handle, 2243static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
2236 struct address_space *mapping,
2237 struct writeback_control *wbc,
2238 struct mpage_da_data *mpd,
2239 pgoff_t *done_index)
2240{ 2244{
2241 struct buffer_head *bh, *head; 2245 struct address_space *mapping = mpd->inode->i_mapping;
2242 struct inode *inode = mapping->host; 2246 struct pagevec pvec;
2243 struct pagevec pvec; 2247 unsigned int nr_pages;
2244 unsigned int nr_pages; 2248 pgoff_t index = mpd->first_page;
2245 sector_t logical; 2249 pgoff_t end = mpd->last_page;
2246 pgoff_t index, end; 2250 int tag;
2247 long nr_to_write = wbc->nr_to_write; 2251 int i, err = 0;
2248 int i, tag, ret = 0; 2252 int blkbits = mpd->inode->i_blkbits;
2249 2253 ext4_lblk_t lblk;
2250 memset(mpd, 0, sizeof(struct mpage_da_data)); 2254 struct buffer_head *head;
2251 mpd->wbc = wbc;
2252 mpd->inode = inode;
2253 pagevec_init(&pvec, 0);
2254 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2255 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2256 2255
2257 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2256 if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
2258 tag = PAGECACHE_TAG_TOWRITE; 2257 tag = PAGECACHE_TAG_TOWRITE;
2259 else 2258 else
2260 tag = PAGECACHE_TAG_DIRTY; 2259 tag = PAGECACHE_TAG_DIRTY;
2261 2260
2262 *done_index = index; 2261 pagevec_init(&pvec, 0);
2262 mpd->map.m_len = 0;
2263 mpd->next_page = index;
2263 while (index <= end) { 2264 while (index <= end) {
2264 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2265 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2265 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2266 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2266 if (nr_pages == 0) 2267 if (nr_pages == 0)
2267 return 0; 2268 goto out;
2268 2269
2269 for (i = 0; i < nr_pages; i++) { 2270 for (i = 0; i < nr_pages; i++) {
2270 struct page *page = pvec.pages[i]; 2271 struct page *page = pvec.pages[i];
@@ -2279,31 +2280,21 @@ static int write_cache_pages_da(handle_t *handle,
2279 if (page->index > end) 2280 if (page->index > end)
2280 goto out; 2281 goto out;
2281 2282
2282 *done_index = page->index + 1; 2283 /* If we can't merge this page, we are done. */
2283 2284 if (mpd->map.m_len > 0 && mpd->next_page != page->index)
2284 /* 2285 goto out;
2285 * If we can't merge this page, and we have
2286 * accumulated an contiguous region, write it
2287 */
2288 if ((mpd->next_page != page->index) &&
2289 (mpd->next_page != mpd->first_page)) {
2290 mpage_da_map_and_submit(mpd);
2291 goto ret_extent_tail;
2292 }
2293 2286
2294 lock_page(page); 2287 lock_page(page);
2295
2296 /* 2288 /*
2297 * If the page is no longer dirty, or its 2289 * If the page is no longer dirty, or its mapping no
2298 * mapping no longer corresponds to inode we 2290 * longer corresponds to inode we are writing (which
2299 * are writing (which means it has been 2291 * means it has been truncated or invalidated), or the
2300 * truncated or invalidated), or the page is 2292 * page is already under writeback and we are not doing
2301 * already under writeback and we are not 2293 * a data integrity writeback, skip the page
2302 * doing a data integrity writeback, skip the page
2303 */ 2294 */
2304 if (!PageDirty(page) || 2295 if (!PageDirty(page) ||
2305 (PageWriteback(page) && 2296 (PageWriteback(page) &&
2306 (wbc->sync_mode == WB_SYNC_NONE)) || 2297 (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
2307 unlikely(page->mapping != mapping)) { 2298 unlikely(page->mapping != mapping)) {
2308 unlock_page(page); 2299 unlock_page(page);
2309 continue; 2300 continue;
@@ -2312,101 +2303,57 @@ static int write_cache_pages_da(handle_t *handle,
2312 wait_on_page_writeback(page); 2303 wait_on_page_writeback(page);
2313 BUG_ON(PageWriteback(page)); 2304 BUG_ON(PageWriteback(page));
2314 2305
2315 /* 2306 if (mpd->map.m_len == 0)
2316 * If we have inline data and arrive here, it means that
2317 * we will soon create the block for the 1st page, so
2318 * we'd better clear the inline data here.
2319 */
2320 if (ext4_has_inline_data(inode)) {
2321 BUG_ON(ext4_test_inode_state(inode,
2322 EXT4_STATE_MAY_INLINE_DATA));
2323 ext4_destroy_inline_data(handle, inode);
2324 }
2325
2326 if (mpd->next_page != page->index)
2327 mpd->first_page = page->index; 2307 mpd->first_page = page->index;
2328 mpd->next_page = page->index + 1; 2308 mpd->next_page = page->index + 1;
2329 logical = (sector_t) page->index <<
2330 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2331
2332 /* Add all dirty buffers to mpd */ 2309 /* Add all dirty buffers to mpd */
2310 lblk = ((ext4_lblk_t)page->index) <<
2311 (PAGE_CACHE_SHIFT - blkbits);
2333 head = page_buffers(page); 2312 head = page_buffers(page);
2334 bh = head; 2313 if (!add_page_bufs_to_extent(mpd, head, head, lblk))
2335 do { 2314 goto out;
2336 BUG_ON(buffer_locked(bh)); 2315 /* So far everything mapped? Submit the page for IO. */
2337 /* 2316 if (mpd->map.m_len == 0) {
2338 * We need to try to allocate unmapped blocks 2317 err = mpage_submit_page(mpd, page);
2339 * in the same page. Otherwise we won't make 2318 if (err < 0)
2340 * progress with the page in ext4_writepage
2341 */
2342 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2343 mpage_add_bh_to_extent(mpd, logical,
2344 bh->b_state);
2345 if (mpd->io_done)
2346 goto ret_extent_tail;
2347 } else if (buffer_dirty(bh) &&
2348 buffer_mapped(bh)) {
2349 /*
2350 * mapped dirty buffer. We need to
2351 * update the b_state because we look
2352 * at b_state in mpage_da_map_blocks.
2353 * We don't update b_size because if we
2354 * find an unmapped buffer_head later
2355 * we need to use the b_state flag of
2356 * that buffer_head.
2357 */
2358 if (mpd->b_size == 0)
2359 mpd->b_state =
2360 bh->b_state & BH_FLAGS;
2361 }
2362 logical++;
2363 } while ((bh = bh->b_this_page) != head);
2364
2365 if (nr_to_write > 0) {
2366 nr_to_write--;
2367 if (nr_to_write == 0 &&
2368 wbc->sync_mode == WB_SYNC_NONE)
2369 /*
2370 * We stop writing back only if we are
2371 * not doing integrity sync. In case of
2372 * integrity sync we have to keep going
2373 * because someone may be concurrently
2374 * dirtying pages, and we might have
2375 * synced a lot of newly appeared dirty
2376 * pages, but have not synced all of the
2377 * old dirty pages.
2378 */
2379 goto out; 2319 goto out;
2380 } 2320 }
2321
2322 /*
2323 * Accumulated enough dirty pages? This doesn't apply
2324 * to WB_SYNC_ALL mode. For integrity sync we have to
2325 * keep going because someone may be concurrently
2326 * dirtying pages, and we might have synced a lot of
2327 * newly appeared dirty pages, but have not synced all
2328 * of the old dirty pages.
2329 */
2330 if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
2331 mpd->next_page - mpd->first_page >=
2332 mpd->wbc->nr_to_write)
2333 goto out;
2381 } 2334 }
2382 pagevec_release(&pvec); 2335 pagevec_release(&pvec);
2383 cond_resched(); 2336 cond_resched();
2384 } 2337 }
2385 return 0; 2338 return 0;
2386ret_extent_tail:
2387 ret = MPAGE_DA_EXTENT_TAIL;
2388out: 2339out:
2389 pagevec_release(&pvec); 2340 pagevec_release(&pvec);
2390 cond_resched(); 2341 return err;
2391 return ret;
2392} 2342}
2393 2343
2394
2395static int ext4_da_writepages(struct address_space *mapping, 2344static int ext4_da_writepages(struct address_space *mapping,
2396 struct writeback_control *wbc) 2345 struct writeback_control *wbc)
2397{ 2346{
2398 pgoff_t index; 2347 pgoff_t writeback_index = 0;
2348 long nr_to_write = wbc->nr_to_write;
2399 int range_whole = 0; 2349 int range_whole = 0;
2350 int cycled = 1;
2400 handle_t *handle = NULL; 2351 handle_t *handle = NULL;
2401 struct mpage_da_data mpd; 2352 struct mpage_da_data mpd;
2402 struct inode *inode = mapping->host; 2353 struct inode *inode = mapping->host;
2403 int pages_written = 0;
2404 int range_cyclic, cycled = 1, io_done = 0;
2405 int needed_blocks, ret = 0; 2354 int needed_blocks, ret = 0;
2406 loff_t range_start = wbc->range_start;
2407 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2355 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2408 pgoff_t done_index = 0; 2356 bool done;
2409 pgoff_t end;
2410 struct blk_plug plug; 2357 struct blk_plug plug;
2411 2358
2412 trace_ext4_da_writepages(inode, wbc); 2359 trace_ext4_da_writepages(inode, wbc);
@@ -2432,40 +2379,65 @@ static int ext4_da_writepages(struct address_space *mapping,
2432 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2379 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2433 return -EROFS; 2380 return -EROFS;
2434 2381
2382 /*
2383 * If we have inline data and arrive here, it means that
2384 * we will soon create the block for the 1st page, so
2385 * we'd better clear the inline data here.
2386 */
2387 if (ext4_has_inline_data(inode)) {
2388 /* Just inode will be modified... */
2389 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
2390 if (IS_ERR(handle)) {
2391 ret = PTR_ERR(handle);
2392 goto out_writepages;
2393 }
2394 BUG_ON(ext4_test_inode_state(inode,
2395 EXT4_STATE_MAY_INLINE_DATA));
2396 ext4_destroy_inline_data(handle, inode);
2397 ext4_journal_stop(handle);
2398 }
2399
2435 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2400 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2436 range_whole = 1; 2401 range_whole = 1;
2437 2402
2438 range_cyclic = wbc->range_cyclic;
2439 if (wbc->range_cyclic) { 2403 if (wbc->range_cyclic) {
2440 index = mapping->writeback_index; 2404 writeback_index = mapping->writeback_index;
2441 if (index) 2405 if (writeback_index)
2442 cycled = 0; 2406 cycled = 0;
2443 wbc->range_start = index << PAGE_CACHE_SHIFT; 2407 mpd.first_page = writeback_index;
2444 wbc->range_end = LLONG_MAX; 2408 mpd.last_page = -1;
2445 wbc->range_cyclic = 0;
2446 end = -1;
2447 } else { 2409 } else {
2448 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2410 mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
2449 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2411 mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
2450 } 2412 }
2451 2413
2414 mpd.inode = inode;
2415 mpd.wbc = wbc;
2416 ext4_io_submit_init(&mpd.io_submit, wbc);
2452retry: 2417retry:
2453 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2418 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2454 tag_pages_for_writeback(mapping, index, end); 2419 tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
2455 2420 done = false;
2456 blk_start_plug(&plug); 2421 blk_start_plug(&plug);
2457 while (!ret && wbc->nr_to_write > 0) { 2422 while (!done && mpd.first_page <= mpd.last_page) {
2423 /* For each extent of pages we use new io_end */
2424 mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
2425 if (!mpd.io_submit.io_end) {
2426 ret = -ENOMEM;
2427 break;
2428 }
2458 2429
2459 /* 2430 /*
2460 * we insert one extent at a time. So we need 2431 * We have two constraints: We find one extent to map and we
2461 * credit needed for single extent allocation. 2432 * must always write out whole page (makes a difference when
2462 * journalled mode is currently not supported 2433 * blocksize < pagesize) so that we don't block on IO when we
2463 * by delalloc 2434 * try to write out the rest of the page. Journalled mode is
2435 * not supported by delalloc.
2464 */ 2436 */
2465 BUG_ON(ext4_should_journal_data(inode)); 2437 BUG_ON(ext4_should_journal_data(inode));
2466 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2438 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2467 2439
2468 /* start a new transaction*/ 2440 /* start a new transaction */
2469 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 2441 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
2470 needed_blocks); 2442 needed_blocks);
2471 if (IS_ERR(handle)) { 2443 if (IS_ERR(handle)) {
@@ -2473,76 +2445,67 @@ retry:
2473 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2445 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2474 "%ld pages, ino %lu; err %d", __func__, 2446 "%ld pages, ino %lu; err %d", __func__,
2475 wbc->nr_to_write, inode->i_ino, ret); 2447 wbc->nr_to_write, inode->i_ino, ret);
2476 blk_finish_plug(&plug); 2448 /* Release allocated io_end */
2477 goto out_writepages; 2449 ext4_put_io_end(mpd.io_submit.io_end);
2450 break;
2478 } 2451 }
2479 2452
2480 /* 2453 trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
2481 * Now call write_cache_pages_da() to find the next 2454 ret = mpage_prepare_extent_to_map(&mpd);
2482 * contiguous region of logical blocks that need 2455 if (!ret) {
2483 * blocks to be allocated by ext4 and submit them. 2456 if (mpd.map.m_len)
2484 */ 2457 ret = mpage_map_and_submit_extent(handle, &mpd);
2485 ret = write_cache_pages_da(handle, mapping, 2458 else {
2486 wbc, &mpd, &done_index); 2459 /*
2487 /* 2460 * We scanned the whole range (or exhausted
2488 * If we have a contiguous extent of pages and we 2461 * nr_to_write), submitted what was mapped and
2489 * haven't done the I/O yet, map the blocks and submit 2462 * didn't find anything needing mapping. We are
2490 * them for I/O. 2463 * done.
2491 */ 2464 */
2492 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 2465 done = true;
2493 mpage_da_map_and_submit(&mpd); 2466 }
2494 ret = MPAGE_DA_EXTENT_TAIL;
2495 } 2467 }
2496 trace_ext4_da_write_pages(inode, &mpd);
2497 wbc->nr_to_write -= mpd.pages_written;
2498
2499 ext4_journal_stop(handle); 2468 ext4_journal_stop(handle);
2500 2469 /* Submit prepared bio */
2501 if ((mpd.retval == -ENOSPC) && sbi->s_journal) { 2470 ext4_io_submit(&mpd.io_submit);
2502 /* commit the transaction which would 2471 /* Unlock pages we didn't use */
2472 mpage_release_unused_pages(&mpd, false);
2473 /* Drop our io_end reference we got from init */
2474 ext4_put_io_end(mpd.io_submit.io_end);
2475
2476 if (ret == -ENOSPC && sbi->s_journal) {
2477 /*
2478 * Commit the transaction which would
2503 * free blocks released in the transaction 2479 * free blocks released in the transaction
2504 * and try again 2480 * and try again
2505 */ 2481 */
2506 jbd2_journal_force_commit_nested(sbi->s_journal); 2482 jbd2_journal_force_commit_nested(sbi->s_journal);
2507 ret = 0; 2483 ret = 0;
2508 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2484 continue;
2509 /* 2485 }
2510 * Got one extent now try with rest of the pages. 2486 /* Fatal error - ENOMEM, EIO... */
2511 * If mpd.retval is set -EIO, journal is aborted. 2487 if (ret)
2512 * So we don't need to write any more.
2513 */
2514 pages_written += mpd.pages_written;
2515 ret = mpd.retval;
2516 io_done = 1;
2517 } else if (wbc->nr_to_write)
2518 /*
2519 * There is no more writeout needed
2520 * or we requested for a noblocking writeout
2521 * and we found the device congested
2522 */
2523 break; 2488 break;
2524 } 2489 }
2525 blk_finish_plug(&plug); 2490 blk_finish_plug(&plug);
2526 if (!io_done && !cycled) { 2491 if (!ret && !cycled) {
2527 cycled = 1; 2492 cycled = 1;
2528 index = 0; 2493 mpd.last_page = writeback_index - 1;
2529 wbc->range_start = index << PAGE_CACHE_SHIFT; 2494 mpd.first_page = 0;
2530 wbc->range_end = mapping->writeback_index - 1;
2531 goto retry; 2495 goto retry;
2532 } 2496 }
2533 2497
2534 /* Update index */ 2498 /* Update index */
2535 wbc->range_cyclic = range_cyclic;
2536 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2499 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2537 /* 2500 /*
2538 * set the writeback_index so that range_cyclic 2501 * Set the writeback_index so that range_cyclic
2539 * mode will write it back later 2502 * mode will write it back later
2540 */ 2503 */
2541 mapping->writeback_index = done_index; 2504 mapping->writeback_index = mpd.first_page;
2542 2505
2543out_writepages: 2506out_writepages:
2544 wbc->range_start = range_start; 2507 trace_ext4_da_writepages_result(inode, wbc, ret,
2545 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 2508 nr_to_write - wbc->nr_to_write);
2546 return ret; 2509 return ret;
2547} 2510}
2548 2511
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 19599bded62a..3e5854625126 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -360,9 +360,6 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
360 bio->bi_bdev = bh->b_bdev; 360 bio->bi_bdev = bh->b_bdev;
361 bio->bi_end_io = ext4_end_bio; 361 bio->bi_end_io = ext4_end_bio;
362 bio->bi_private = ext4_get_io_end(io->io_end); 362 bio->bi_private = ext4_get_io_end(io->io_end);
363 if (!io->io_end->size)
364 io->io_end->offset = (bh->b_page->index << PAGE_CACHE_SHIFT)
365 + bh_offset(bh);
366 io->io_bio = bio; 363 io->io_bio = bio;
367 io->io_next_block = bh->b_blocknr; 364 io->io_next_block = bh->b_blocknr;
368 return 0; 365 return 0;
@@ -390,7 +387,6 @@ submit_and_retry:
390 io_end = io->io_end; 387 io_end = io->io_end;
391 if (test_clear_buffer_uninit(bh)) 388 if (test_clear_buffer_uninit(bh))
392 ext4_set_io_unwritten_flag(inode, io_end); 389 ext4_set_io_unwritten_flag(inode, io_end);
393 io_end->size += bh->b_size;
394 io->io_next_block++; 390 io->io_next_block++;
395 return 0; 391 return 0;
396} 392}