diff options
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r-- | fs/ext4/inode.c | 478 |
1 files changed, 321 insertions, 157 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 59fbbe899acc..7e91913e325b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -41,6 +41,8 @@ | |||
41 | #include "acl.h" | 41 | #include "acl.h" |
42 | #include "ext4_extents.h" | 42 | #include "ext4_extents.h" |
43 | 43 | ||
44 | #define MPAGE_DA_EXTENT_TAIL 0x01 | ||
45 | |||
44 | static inline int ext4_begin_ordered_truncate(struct inode *inode, | 46 | static inline int ext4_begin_ordered_truncate(struct inode *inode, |
45 | loff_t new_size) | 47 | loff_t new_size) |
46 | { | 48 | { |
@@ -1005,6 +1007,9 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) | |||
1005 | */ | 1007 | */ |
1006 | static int ext4_calc_metadata_amount(struct inode *inode, int blocks) | 1008 | static int ext4_calc_metadata_amount(struct inode *inode, int blocks) |
1007 | { | 1009 | { |
1010 | if (!blocks) | ||
1011 | return 0; | ||
1012 | |||
1008 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) | 1013 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) |
1009 | return ext4_ext_calc_metadata_amount(inode, blocks); | 1014 | return ext4_ext_calc_metadata_amount(inode, blocks); |
1010 | 1015 | ||
@@ -1041,18 +1046,6 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) | |||
1041 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1046 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
1042 | } | 1047 | } |
1043 | 1048 | ||
1044 | /* Maximum number of blocks we map for direct IO at once. */ | ||
1045 | #define DIO_MAX_BLOCKS 4096 | ||
1046 | /* | ||
1047 | * Number of credits we need for writing DIO_MAX_BLOCKS: | ||
1048 | * We need sb + group descriptor + bitmap + inode -> 4 | ||
1049 | * For B blocks with A block pointers per block we need: | ||
1050 | * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). | ||
1051 | * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. | ||
1052 | */ | ||
1053 | #define DIO_CREDITS 25 | ||
1054 | |||
1055 | |||
1056 | /* | 1049 | /* |
1057 | * The ext4_get_blocks_wrap() function try to look up the requested blocks, | 1050 | * The ext4_get_blocks_wrap() function try to look up the requested blocks, |
1058 | * and returns if the blocks are already mapped. | 1051 | * and returns if the blocks are already mapped. |
@@ -1164,19 +1157,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | |||
1164 | return retval; | 1157 | return retval; |
1165 | } | 1158 | } |
1166 | 1159 | ||
1160 | /* Maximum number of blocks we map for direct IO at once. */ | ||
1161 | #define DIO_MAX_BLOCKS 4096 | ||
1162 | |||
1167 | static int ext4_get_block(struct inode *inode, sector_t iblock, | 1163 | static int ext4_get_block(struct inode *inode, sector_t iblock, |
1168 | struct buffer_head *bh_result, int create) | 1164 | struct buffer_head *bh_result, int create) |
1169 | { | 1165 | { |
1170 | handle_t *handle = ext4_journal_current_handle(); | 1166 | handle_t *handle = ext4_journal_current_handle(); |
1171 | int ret = 0, started = 0; | 1167 | int ret = 0, started = 0; |
1172 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | 1168 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; |
1169 | int dio_credits; | ||
1173 | 1170 | ||
1174 | if (create && !handle) { | 1171 | if (create && !handle) { |
1175 | /* Direct IO write... */ | 1172 | /* Direct IO write... */ |
1176 | if (max_blocks > DIO_MAX_BLOCKS) | 1173 | if (max_blocks > DIO_MAX_BLOCKS) |
1177 | max_blocks = DIO_MAX_BLOCKS; | 1174 | max_blocks = DIO_MAX_BLOCKS; |
1178 | handle = ext4_journal_start(inode, DIO_CREDITS + | 1175 | dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); |
1179 | 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); | 1176 | handle = ext4_journal_start(inode, dio_credits); |
1180 | if (IS_ERR(handle)) { | 1177 | if (IS_ERR(handle)) { |
1181 | ret = PTR_ERR(handle); | 1178 | ret = PTR_ERR(handle); |
1182 | goto out; | 1179 | goto out; |
@@ -1559,7 +1556,25 @@ static void ext4_da_release_space(struct inode *inode, int to_free) | |||
1559 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1556 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1560 | int total, mdb, mdb_free, release; | 1557 | int total, mdb, mdb_free, release; |
1561 | 1558 | ||
1559 | if (!to_free) | ||
1560 | return; /* Nothing to release, exit */ | ||
1561 | |||
1562 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | 1562 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); |
1563 | |||
1564 | if (!EXT4_I(inode)->i_reserved_data_blocks) { | ||
1565 | /* | ||
1566 | * if there is no reserved blocks, but we try to free some | ||
1567 | * then the counter is messed up somewhere. | ||
1568 | * but since this function is called from invalidate | ||
1569 | * page, it's harmless to return without any action | ||
1570 | */ | ||
1571 | printk(KERN_INFO "ext4 delalloc try to release %d reserved " | ||
1572 | "blocks for inode %lu, but there is no reserved " | ||
1573 | "data blocks\n", to_free, inode->i_ino); | ||
1574 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
1575 | return; | ||
1576 | } | ||
1577 | |||
1563 | /* recalculate the number of metablocks still need to be reserved */ | 1578 | /* recalculate the number of metablocks still need to be reserved */ |
1564 | total = EXT4_I(inode)->i_reserved_data_blocks - to_free; | 1579 | total = EXT4_I(inode)->i_reserved_data_blocks - to_free; |
1565 | mdb = ext4_calc_metadata_amount(inode, total); | 1580 | mdb = ext4_calc_metadata_amount(inode, total); |
@@ -1613,11 +1628,13 @@ struct mpage_da_data { | |||
1613 | unsigned long first_page, next_page; /* extent of pages */ | 1628 | unsigned long first_page, next_page; /* extent of pages */ |
1614 | get_block_t *get_block; | 1629 | get_block_t *get_block; |
1615 | struct writeback_control *wbc; | 1630 | struct writeback_control *wbc; |
1631 | int io_done; | ||
1632 | long pages_written; | ||
1616 | }; | 1633 | }; |
1617 | 1634 | ||
1618 | /* | 1635 | /* |
1619 | * mpage_da_submit_io - walks through extent of pages and try to write | 1636 | * mpage_da_submit_io - walks through extent of pages and try to write |
1620 | * them with __mpage_writepage() | 1637 | * them with writepage() call back |
1621 | * | 1638 | * |
1622 | * @mpd->inode: inode | 1639 | * @mpd->inode: inode |
1623 | * @mpd->first_page: first page of the extent | 1640 | * @mpd->first_page: first page of the extent |
@@ -1632,18 +1649,11 @@ struct mpage_da_data { | |||
1632 | static int mpage_da_submit_io(struct mpage_da_data *mpd) | 1649 | static int mpage_da_submit_io(struct mpage_da_data *mpd) |
1633 | { | 1650 | { |
1634 | struct address_space *mapping = mpd->inode->i_mapping; | 1651 | struct address_space *mapping = mpd->inode->i_mapping; |
1635 | struct mpage_data mpd_pp = { | ||
1636 | .bio = NULL, | ||
1637 | .last_block_in_bio = 0, | ||
1638 | .get_block = mpd->get_block, | ||
1639 | .use_writepage = 1, | ||
1640 | }; | ||
1641 | int ret = 0, err, nr_pages, i; | 1652 | int ret = 0, err, nr_pages, i; |
1642 | unsigned long index, end; | 1653 | unsigned long index, end; |
1643 | struct pagevec pvec; | 1654 | struct pagevec pvec; |
1644 | 1655 | ||
1645 | BUG_ON(mpd->next_page <= mpd->first_page); | 1656 | BUG_ON(mpd->next_page <= mpd->first_page); |
1646 | |||
1647 | pagevec_init(&pvec, 0); | 1657 | pagevec_init(&pvec, 0); |
1648 | index = mpd->first_page; | 1658 | index = mpd->first_page; |
1649 | end = mpd->next_page - 1; | 1659 | end = mpd->next_page - 1; |
@@ -1661,8 +1671,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) | |||
1661 | break; | 1671 | break; |
1662 | index++; | 1672 | index++; |
1663 | 1673 | ||
1664 | err = __mpage_writepage(page, mpd->wbc, &mpd_pp); | 1674 | err = mapping->a_ops->writepage(page, mpd->wbc); |
1665 | 1675 | if (!err) | |
1676 | mpd->pages_written++; | ||
1666 | /* | 1677 | /* |
1667 | * In error case, we have to continue because | 1678 | * In error case, we have to continue because |
1668 | * remaining pages are still locked | 1679 | * remaining pages are still locked |
@@ -1673,9 +1684,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) | |||
1673 | } | 1684 | } |
1674 | pagevec_release(&pvec); | 1685 | pagevec_release(&pvec); |
1675 | } | 1686 | } |
1676 | if (mpd_pp.bio) | ||
1677 | mpage_bio_submit(WRITE, mpd_pp.bio); | ||
1678 | |||
1679 | return ret; | 1687 | return ret; |
1680 | } | 1688 | } |
1681 | 1689 | ||
@@ -1698,7 +1706,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, | |||
1698 | int blocks = exbh->b_size >> inode->i_blkbits; | 1706 | int blocks = exbh->b_size >> inode->i_blkbits; |
1699 | sector_t pblock = exbh->b_blocknr, cur_logical; | 1707 | sector_t pblock = exbh->b_blocknr, cur_logical; |
1700 | struct buffer_head *head, *bh; | 1708 | struct buffer_head *head, *bh; |
1701 | unsigned long index, end; | 1709 | pgoff_t index, end; |
1702 | struct pagevec pvec; | 1710 | struct pagevec pvec; |
1703 | int nr_pages, i; | 1711 | int nr_pages, i; |
1704 | 1712 | ||
@@ -1741,6 +1749,13 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, | |||
1741 | if (buffer_delay(bh)) { | 1749 | if (buffer_delay(bh)) { |
1742 | bh->b_blocknr = pblock; | 1750 | bh->b_blocknr = pblock; |
1743 | clear_buffer_delay(bh); | 1751 | clear_buffer_delay(bh); |
1752 | bh->b_bdev = inode->i_sb->s_bdev; | ||
1753 | } else if (buffer_unwritten(bh)) { | ||
1754 | bh->b_blocknr = pblock; | ||
1755 | clear_buffer_unwritten(bh); | ||
1756 | set_buffer_mapped(bh); | ||
1757 | set_buffer_new(bh); | ||
1758 | bh->b_bdev = inode->i_sb->s_bdev; | ||
1744 | } else if (buffer_mapped(bh)) | 1759 | } else if (buffer_mapped(bh)) |
1745 | BUG_ON(bh->b_blocknr != pblock); | 1760 | BUG_ON(bh->b_blocknr != pblock); |
1746 | 1761 | ||
@@ -1776,13 +1791,11 @@ static inline void __unmap_underlying_blocks(struct inode *inode, | |||
1776 | * | 1791 | * |
1777 | * The function skips space we know is already mapped to disk blocks. | 1792 | * The function skips space we know is already mapped to disk blocks. |
1778 | * | 1793 | * |
1779 | * The function ignores errors ->get_block() returns, thus real | ||
1780 | * error handling is postponed to __mpage_writepage() | ||
1781 | */ | 1794 | */ |
1782 | static void mpage_da_map_blocks(struct mpage_da_data *mpd) | 1795 | static void mpage_da_map_blocks(struct mpage_da_data *mpd) |
1783 | { | 1796 | { |
1797 | int err = 0; | ||
1784 | struct buffer_head *lbh = &mpd->lbh; | 1798 | struct buffer_head *lbh = &mpd->lbh; |
1785 | int err = 0, remain = lbh->b_size; | ||
1786 | sector_t next = lbh->b_blocknr; | 1799 | sector_t next = lbh->b_blocknr; |
1787 | struct buffer_head new; | 1800 | struct buffer_head new; |
1788 | 1801 | ||
@@ -1792,38 +1805,36 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) | |||
1792 | if (buffer_mapped(lbh) && !buffer_delay(lbh)) | 1805 | if (buffer_mapped(lbh) && !buffer_delay(lbh)) |
1793 | return; | 1806 | return; |
1794 | 1807 | ||
1795 | while (remain) { | 1808 | new.b_state = lbh->b_state; |
1796 | new.b_state = lbh->b_state; | 1809 | new.b_blocknr = 0; |
1797 | new.b_blocknr = 0; | 1810 | new.b_size = lbh->b_size; |
1798 | new.b_size = remain; | ||
1799 | err = mpd->get_block(mpd->inode, next, &new, 1); | ||
1800 | if (err) { | ||
1801 | /* | ||
1802 | * Rather than implement own error handling | ||
1803 | * here, we just leave remaining blocks | ||
1804 | * unallocated and try again with ->writepage() | ||
1805 | */ | ||
1806 | break; | ||
1807 | } | ||
1808 | BUG_ON(new.b_size == 0); | ||
1809 | 1811 | ||
1810 | if (buffer_new(&new)) | 1812 | /* |
1811 | __unmap_underlying_blocks(mpd->inode, &new); | 1813 | * If we didn't accumulate anything |
1814 | * to write simply return | ||
1815 | */ | ||
1816 | if (!new.b_size) | ||
1817 | return; | ||
1818 | err = mpd->get_block(mpd->inode, next, &new, 1); | ||
1819 | if (err) | ||
1820 | return; | ||
1821 | BUG_ON(new.b_size == 0); | ||
1812 | 1822 | ||
1813 | /* | 1823 | if (buffer_new(&new)) |
1814 | * If blocks are delayed marked, we need to | 1824 | __unmap_underlying_blocks(mpd->inode, &new); |
1815 | * put actual blocknr and drop delayed bit | ||
1816 | */ | ||
1817 | if (buffer_delay(lbh)) | ||
1818 | mpage_put_bnr_to_bhs(mpd, next, &new); | ||
1819 | 1825 | ||
1820 | /* go for the remaining blocks */ | 1826 | /* |
1821 | next += new.b_size >> mpd->inode->i_blkbits; | 1827 | * If blocks are delayed marked, we need to |
1822 | remain -= new.b_size; | 1828 | * put actual blocknr and drop delayed bit |
1823 | } | 1829 | */ |
1830 | if (buffer_delay(lbh) || buffer_unwritten(lbh)) | ||
1831 | mpage_put_bnr_to_bhs(mpd, next, &new); | ||
1832 | |||
1833 | return; | ||
1824 | } | 1834 | } |
1825 | 1835 | ||
1826 | #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) | 1836 | #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ |
1837 | (1 << BH_Delay) | (1 << BH_Unwritten)) | ||
1827 | 1838 | ||
1828 | /* | 1839 | /* |
1829 | * mpage_add_bh_to_extent - try to add one more block to extent of blocks | 1840 | * mpage_add_bh_to_extent - try to add one more block to extent of blocks |
@@ -1837,41 +1848,61 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) | |||
1837 | static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, | 1848 | static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, |
1838 | sector_t logical, struct buffer_head *bh) | 1849 | sector_t logical, struct buffer_head *bh) |
1839 | { | 1850 | { |
1840 | struct buffer_head *lbh = &mpd->lbh; | ||
1841 | sector_t next; | 1851 | sector_t next; |
1852 | size_t b_size = bh->b_size; | ||
1853 | struct buffer_head *lbh = &mpd->lbh; | ||
1854 | int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; | ||
1842 | 1855 | ||
1843 | next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); | 1856 | /* check if thereserved journal credits might overflow */ |
1844 | 1857 | if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { | |
1858 | if (nrblocks >= EXT4_MAX_TRANS_DATA) { | ||
1859 | /* | ||
1860 | * With non-extent format we are limited by the journal | ||
1861 | * credit available. Total credit needed to insert | ||
1862 | * nrblocks contiguous blocks is dependent on the | ||
1863 | * nrblocks. So limit nrblocks. | ||
1864 | */ | ||
1865 | goto flush_it; | ||
1866 | } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > | ||
1867 | EXT4_MAX_TRANS_DATA) { | ||
1868 | /* | ||
1869 | * Adding the new buffer_head would make it cross the | ||
1870 | * allowed limit for which we have journal credit | ||
1871 | * reserved. So limit the new bh->b_size | ||
1872 | */ | ||
1873 | b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << | ||
1874 | mpd->inode->i_blkbits; | ||
1875 | /* we will do mpage_da_submit_io in the next loop */ | ||
1876 | } | ||
1877 | } | ||
1845 | /* | 1878 | /* |
1846 | * First block in the extent | 1879 | * First block in the extent |
1847 | */ | 1880 | */ |
1848 | if (lbh->b_size == 0) { | 1881 | if (lbh->b_size == 0) { |
1849 | lbh->b_blocknr = logical; | 1882 | lbh->b_blocknr = logical; |
1850 | lbh->b_size = bh->b_size; | 1883 | lbh->b_size = b_size; |
1851 | lbh->b_state = bh->b_state & BH_FLAGS; | 1884 | lbh->b_state = bh->b_state & BH_FLAGS; |
1852 | return; | 1885 | return; |
1853 | } | 1886 | } |
1854 | 1887 | ||
1888 | next = lbh->b_blocknr + nrblocks; | ||
1855 | /* | 1889 | /* |
1856 | * Can we merge the block to our big extent? | 1890 | * Can we merge the block to our big extent? |
1857 | */ | 1891 | */ |
1858 | if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { | 1892 | if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { |
1859 | lbh->b_size += bh->b_size; | 1893 | lbh->b_size += b_size; |
1860 | return; | 1894 | return; |
1861 | } | 1895 | } |
1862 | 1896 | ||
1897 | flush_it: | ||
1863 | /* | 1898 | /* |
1864 | * We couldn't merge the block to our extent, so we | 1899 | * We couldn't merge the block to our extent, so we |
1865 | * need to flush current extent and start new one | 1900 | * need to flush current extent and start new one |
1866 | */ | 1901 | */ |
1867 | mpage_da_map_blocks(mpd); | 1902 | mpage_da_map_blocks(mpd); |
1868 | 1903 | mpage_da_submit_io(mpd); | |
1869 | /* | 1904 | mpd->io_done = 1; |
1870 | * Now start a new extent | 1905 | return; |
1871 | */ | ||
1872 | lbh->b_size = bh->b_size; | ||
1873 | lbh->b_state = bh->b_state & BH_FLAGS; | ||
1874 | lbh->b_blocknr = logical; | ||
1875 | } | 1906 | } |
1876 | 1907 | ||
1877 | /* | 1908 | /* |
@@ -1891,17 +1922,35 @@ static int __mpage_da_writepage(struct page *page, | |||
1891 | struct buffer_head *bh, *head, fake; | 1922 | struct buffer_head *bh, *head, fake; |
1892 | sector_t logical; | 1923 | sector_t logical; |
1893 | 1924 | ||
1925 | if (mpd->io_done) { | ||
1926 | /* | ||
1927 | * Rest of the page in the page_vec | ||
1928 | * redirty then and skip then. We will | ||
1929 | * try to to write them again after | ||
1930 | * starting a new transaction | ||
1931 | */ | ||
1932 | redirty_page_for_writepage(wbc, page); | ||
1933 | unlock_page(page); | ||
1934 | return MPAGE_DA_EXTENT_TAIL; | ||
1935 | } | ||
1894 | /* | 1936 | /* |
1895 | * Can we merge this page to current extent? | 1937 | * Can we merge this page to current extent? |
1896 | */ | 1938 | */ |
1897 | if (mpd->next_page != page->index) { | 1939 | if (mpd->next_page != page->index) { |
1898 | /* | 1940 | /* |
1899 | * Nope, we can't. So, we map non-allocated blocks | 1941 | * Nope, we can't. So, we map non-allocated blocks |
1900 | * and start IO on them using __mpage_writepage() | 1942 | * and start IO on them using writepage() |
1901 | */ | 1943 | */ |
1902 | if (mpd->next_page != mpd->first_page) { | 1944 | if (mpd->next_page != mpd->first_page) { |
1903 | mpage_da_map_blocks(mpd); | 1945 | mpage_da_map_blocks(mpd); |
1904 | mpage_da_submit_io(mpd); | 1946 | mpage_da_submit_io(mpd); |
1947 | /* | ||
1948 | * skip rest of the page in the page_vec | ||
1949 | */ | ||
1950 | mpd->io_done = 1; | ||
1951 | redirty_page_for_writepage(wbc, page); | ||
1952 | unlock_page(page); | ||
1953 | return MPAGE_DA_EXTENT_TAIL; | ||
1905 | } | 1954 | } |
1906 | 1955 | ||
1907 | /* | 1956 | /* |
@@ -1932,6 +1981,8 @@ static int __mpage_da_writepage(struct page *page, | |||
1932 | set_buffer_dirty(bh); | 1981 | set_buffer_dirty(bh); |
1933 | set_buffer_uptodate(bh); | 1982 | set_buffer_uptodate(bh); |
1934 | mpage_add_bh_to_extent(mpd, logical, bh); | 1983 | mpage_add_bh_to_extent(mpd, logical, bh); |
1984 | if (mpd->io_done) | ||
1985 | return MPAGE_DA_EXTENT_TAIL; | ||
1935 | } else { | 1986 | } else { |
1936 | /* | 1987 | /* |
1937 | * Page with regular buffer heads, just add all dirty ones | 1988 | * Page with regular buffer heads, just add all dirty ones |
@@ -1940,8 +1991,12 @@ static int __mpage_da_writepage(struct page *page, | |||
1940 | bh = head; | 1991 | bh = head; |
1941 | do { | 1992 | do { |
1942 | BUG_ON(buffer_locked(bh)); | 1993 | BUG_ON(buffer_locked(bh)); |
1943 | if (buffer_dirty(bh)) | 1994 | if (buffer_dirty(bh) && |
1995 | (!buffer_mapped(bh) || buffer_delay(bh))) { | ||
1944 | mpage_add_bh_to_extent(mpd, logical, bh); | 1996 | mpage_add_bh_to_extent(mpd, logical, bh); |
1997 | if (mpd->io_done) | ||
1998 | return MPAGE_DA_EXTENT_TAIL; | ||
1999 | } | ||
1945 | logical++; | 2000 | logical++; |
1946 | } while ((bh = bh->b_this_page) != head); | 2001 | } while ((bh = bh->b_this_page) != head); |
1947 | } | 2002 | } |
@@ -1960,22 +2015,13 @@ static int __mpage_da_writepage(struct page *page, | |||
1960 | * | 2015 | * |
1961 | * This is a library function, which implements the writepages() | 2016 | * This is a library function, which implements the writepages() |
1962 | * address_space_operation. | 2017 | * address_space_operation. |
1963 | * | ||
1964 | * In order to avoid duplication of logic that deals with partial pages, | ||
1965 | * multiple bio per page, etc, we find non-allocated blocks, allocate | ||
1966 | * them with minimal calls to ->get_block() and re-use __mpage_writepage() | ||
1967 | * | ||
1968 | * It's important that we call __mpage_writepage() only once for each | ||
1969 | * involved page, otherwise we'd have to implement more complicated logic | ||
1970 | * to deal with pages w/o PG_lock or w/ PG_writeback and so on. | ||
1971 | * | ||
1972 | * See comments to mpage_writepages() | ||
1973 | */ | 2018 | */ |
1974 | static int mpage_da_writepages(struct address_space *mapping, | 2019 | static int mpage_da_writepages(struct address_space *mapping, |
1975 | struct writeback_control *wbc, | 2020 | struct writeback_control *wbc, |
1976 | get_block_t get_block) | 2021 | get_block_t get_block) |
1977 | { | 2022 | { |
1978 | struct mpage_da_data mpd; | 2023 | struct mpage_da_data mpd; |
2024 | long to_write; | ||
1979 | int ret; | 2025 | int ret; |
1980 | 2026 | ||
1981 | if (!get_block) | 2027 | if (!get_block) |
@@ -1989,17 +2035,22 @@ static int mpage_da_writepages(struct address_space *mapping, | |||
1989 | mpd.first_page = 0; | 2035 | mpd.first_page = 0; |
1990 | mpd.next_page = 0; | 2036 | mpd.next_page = 0; |
1991 | mpd.get_block = get_block; | 2037 | mpd.get_block = get_block; |
2038 | mpd.io_done = 0; | ||
2039 | mpd.pages_written = 0; | ||
2040 | |||
2041 | to_write = wbc->nr_to_write; | ||
1992 | 2042 | ||
1993 | ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); | 2043 | ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); |
1994 | 2044 | ||
1995 | /* | 2045 | /* |
1996 | * Handle last extent of pages | 2046 | * Handle last extent of pages |
1997 | */ | 2047 | */ |
1998 | if (mpd.next_page != mpd.first_page) { | 2048 | if (!mpd.io_done && mpd.next_page != mpd.first_page) { |
1999 | mpage_da_map_blocks(&mpd); | 2049 | mpage_da_map_blocks(&mpd); |
2000 | mpage_da_submit_io(&mpd); | 2050 | mpage_da_submit_io(&mpd); |
2001 | } | 2051 | } |
2002 | 2052 | ||
2053 | wbc->nr_to_write = to_write - mpd.pages_written; | ||
2003 | return ret; | 2054 | return ret; |
2004 | } | 2055 | } |
2005 | 2056 | ||
@@ -2204,63 +2255,95 @@ static int ext4_da_writepage(struct page *page, | |||
2204 | } | 2255 | } |
2205 | 2256 | ||
2206 | /* | 2257 | /* |
2207 | * For now just follow the DIO way to estimate the max credits | 2258 | * This is called via ext4_da_writepages() to |
2208 | * needed to write out EXT4_MAX_WRITEBACK_PAGES. | 2259 | * calulate the total number of credits to reserve to fit |
2209 | * todo: need to calculate the max credits need for | 2260 | * a single extent allocation into a single transaction, |
2210 | * extent based files, currently the DIO credits is based on | 2261 | * ext4_da_writpeages() will loop calling this before |
2211 | * indirect-blocks mapping way. | 2262 | * the block allocation. |
2212 | * | ||
2213 | * Probably should have a generic way to calculate credits | ||
2214 | * for DIO, writepages, and truncate | ||
2215 | */ | 2263 | */ |
2216 | #define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS | 2264 | |
2217 | #define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS | 2265 | static int ext4_da_writepages_trans_blocks(struct inode *inode) |
2266 | { | ||
2267 | int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; | ||
2268 | |||
2269 | /* | ||
2270 | * With non-extent format the journal credit needed to | ||
2271 | * insert nrblocks contiguous block is dependent on | ||
2272 | * number of contiguous block. So we will limit | ||
2273 | * number of contiguous block to a sane value | ||
2274 | */ | ||
2275 | if (!(inode->i_flags & EXT4_EXTENTS_FL) && | ||
2276 | (max_blocks > EXT4_MAX_TRANS_DATA)) | ||
2277 | max_blocks = EXT4_MAX_TRANS_DATA; | ||
2278 | |||
2279 | return ext4_chunk_trans_blocks(inode, max_blocks); | ||
2280 | } | ||
2218 | 2281 | ||
2219 | static int ext4_da_writepages(struct address_space *mapping, | 2282 | static int ext4_da_writepages(struct address_space *mapping, |
2220 | struct writeback_control *wbc) | 2283 | struct writeback_control *wbc) |
2221 | { | 2284 | { |
2222 | struct inode *inode = mapping->host; | ||
2223 | handle_t *handle = NULL; | 2285 | handle_t *handle = NULL; |
2224 | int needed_blocks; | ||
2225 | int ret = 0; | ||
2226 | long to_write; | ||
2227 | loff_t range_start = 0; | 2286 | loff_t range_start = 0; |
2287 | struct inode *inode = mapping->host; | ||
2288 | int needed_blocks, ret = 0, nr_to_writebump = 0; | ||
2289 | long to_write, pages_skipped = 0; | ||
2290 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); | ||
2228 | 2291 | ||
2229 | /* | 2292 | /* |
2230 | * No pages to write? This is mainly a kludge to avoid starting | 2293 | * No pages to write? This is mainly a kludge to avoid starting |
2231 | * a transaction for special inodes like journal inode on last iput() | 2294 | * a transaction for special inodes like journal inode on last iput() |
2232 | * because that could violate lock ordering on umount | 2295 | * because that could violate lock ordering on umount |
2233 | */ | 2296 | */ |
2234 | if (!mapping->nrpages) | 2297 | if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) |
2235 | return 0; | 2298 | return 0; |
2236 | |||
2237 | /* | 2299 | /* |
2238 | * Estimate the worse case needed credits to write out | 2300 | * Make sure nr_to_write is >= sbi->s_mb_stream_request |
2239 | * EXT4_MAX_BUF_BLOCKS pages | 2301 | * This make sure small files blocks are allocated in |
2302 | * single attempt. This ensure that small files | ||
2303 | * get less fragmented. | ||
2240 | */ | 2304 | */ |
2241 | needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; | 2305 | if (wbc->nr_to_write < sbi->s_mb_stream_request) { |
2306 | nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; | ||
2307 | wbc->nr_to_write = sbi->s_mb_stream_request; | ||
2308 | } | ||
2242 | 2309 | ||
2243 | to_write = wbc->nr_to_write; | 2310 | if (!wbc->range_cyclic) |
2244 | if (!wbc->range_cyclic) { | ||
2245 | /* | 2311 | /* |
2246 | * If range_cyclic is not set force range_cont | 2312 | * If range_cyclic is not set force range_cont |
2247 | * and save the old writeback_index | 2313 | * and save the old writeback_index |
2248 | */ | 2314 | */ |
2249 | wbc->range_cont = 1; | 2315 | wbc->range_cont = 1; |
2250 | range_start = wbc->range_start; | ||
2251 | } | ||
2252 | 2316 | ||
2253 | while (!ret && to_write) { | 2317 | range_start = wbc->range_start; |
2318 | pages_skipped = wbc->pages_skipped; | ||
2319 | |||
2320 | restart_loop: | ||
2321 | to_write = wbc->nr_to_write; | ||
2322 | while (!ret && to_write > 0) { | ||
2323 | |||
2324 | /* | ||
2325 | * we insert one extent at a time. So we need | ||
2326 | * credit needed for single extent allocation. | ||
2327 | * journalled mode is currently not supported | ||
2328 | * by delalloc | ||
2329 | */ | ||
2330 | BUG_ON(ext4_should_journal_data(inode)); | ||
2331 | needed_blocks = ext4_da_writepages_trans_blocks(inode); | ||
2332 | |||
2254 | /* start a new transaction*/ | 2333 | /* start a new transaction*/ |
2255 | handle = ext4_journal_start(inode, needed_blocks); | 2334 | handle = ext4_journal_start(inode, needed_blocks); |
2256 | if (IS_ERR(handle)) { | 2335 | if (IS_ERR(handle)) { |
2257 | ret = PTR_ERR(handle); | 2336 | ret = PTR_ERR(handle); |
2337 | printk(KERN_EMERG "%s: jbd2_start: " | ||
2338 | "%ld pages, ino %lu; err %d\n", __func__, | ||
2339 | wbc->nr_to_write, inode->i_ino, ret); | ||
2340 | dump_stack(); | ||
2258 | goto out_writepages; | 2341 | goto out_writepages; |
2259 | } | 2342 | } |
2260 | if (ext4_should_order_data(inode)) { | 2343 | if (ext4_should_order_data(inode)) { |
2261 | /* | 2344 | /* |
2262 | * With ordered mode we need to add | 2345 | * With ordered mode we need to add |
2263 | * the inode to the journal handle | 2346 | * the inode to the journal handl |
2264 | * when we do block allocation. | 2347 | * when we do block allocation. |
2265 | */ | 2348 | */ |
2266 | ret = ext4_jbd2_file_inode(handle, inode); | 2349 | ret = ext4_jbd2_file_inode(handle, inode); |
@@ -2268,20 +2351,20 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2268 | ext4_journal_stop(handle); | 2351 | ext4_journal_stop(handle); |
2269 | goto out_writepages; | 2352 | goto out_writepages; |
2270 | } | 2353 | } |
2271 | |||
2272 | } | 2354 | } |
2273 | /* | ||
2274 | * set the max dirty pages could be write at a time | ||
2275 | * to fit into the reserved transaction credits | ||
2276 | */ | ||
2277 | if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) | ||
2278 | wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; | ||
2279 | 2355 | ||
2280 | to_write -= wbc->nr_to_write; | 2356 | to_write -= wbc->nr_to_write; |
2281 | ret = mpage_da_writepages(mapping, wbc, | 2357 | ret = mpage_da_writepages(mapping, wbc, |
2282 | ext4_da_get_block_write); | 2358 | ext4_da_get_block_write); |
2283 | ext4_journal_stop(handle); | 2359 | ext4_journal_stop(handle); |
2284 | if (wbc->nr_to_write) { | 2360 | if (ret == MPAGE_DA_EXTENT_TAIL) { |
2361 | /* | ||
2362 | * got one extent now try with | ||
2363 | * rest of the pages | ||
2364 | */ | ||
2365 | to_write += wbc->nr_to_write; | ||
2366 | ret = 0; | ||
2367 | } else if (wbc->nr_to_write) { | ||
2285 | /* | 2368 | /* |
2286 | * There is no more writeout needed | 2369 | * There is no more writeout needed |
2287 | * or we requested for a noblocking writeout | 2370 | * or we requested for a noblocking writeout |
@@ -2293,10 +2376,18 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2293 | wbc->nr_to_write = to_write; | 2376 | wbc->nr_to_write = to_write; |
2294 | } | 2377 | } |
2295 | 2378 | ||
2296 | out_writepages: | 2379 | if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { |
2297 | wbc->nr_to_write = to_write; | 2380 | /* We skipped pages in this loop */ |
2298 | if (range_start) | ||
2299 | wbc->range_start = range_start; | 2381 | wbc->range_start = range_start; |
2382 | wbc->nr_to_write = to_write + | ||
2383 | wbc->pages_skipped - pages_skipped; | ||
2384 | wbc->pages_skipped = pages_skipped; | ||
2385 | goto restart_loop; | ||
2386 | } | ||
2387 | |||
2388 | out_writepages: | ||
2389 | wbc->nr_to_write = to_write - nr_to_writebump; | ||
2390 | wbc->range_start = range_start; | ||
2300 | return ret; | 2391 | return ret; |
2301 | } | 2392 | } |
2302 | 2393 | ||
@@ -3486,6 +3577,9 @@ void ext4_truncate(struct inode *inode) | |||
3486 | * modify the block allocation tree. | 3577 | * modify the block allocation tree. |
3487 | */ | 3578 | */ |
3488 | down_write(&ei->i_data_sem); | 3579 | down_write(&ei->i_data_sem); |
3580 | |||
3581 | ext4_discard_reservation(inode); | ||
3582 | |||
3489 | /* | 3583 | /* |
3490 | * The orphan list entry will now protect us from any crash which | 3584 | * The orphan list entry will now protect us from any crash which |
3491 | * occurs before the truncate completes, so it is now safe to propagate | 3585 | * occurs before the truncate completes, so it is now safe to propagate |
@@ -3555,8 +3649,6 @@ do_indirects: | |||
3555 | ; | 3649 | ; |
3556 | } | 3650 | } |
3557 | 3651 | ||
3558 | ext4_discard_reservation(inode); | ||
3559 | |||
3560 | up_write(&ei->i_data_sem); | 3652 | up_write(&ei->i_data_sem); |
3561 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | 3653 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); |
3562 | ext4_mark_inode_dirty(handle, inode); | 3654 | ext4_mark_inode_dirty(handle, inode); |
@@ -4324,57 +4416,129 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
4324 | return 0; | 4416 | return 0; |
4325 | } | 4417 | } |
4326 | 4418 | ||
4419 | static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, | ||
4420 | int chunk) | ||
4421 | { | ||
4422 | int indirects; | ||
4423 | |||
4424 | /* if nrblocks are contiguous */ | ||
4425 | if (chunk) { | ||
4426 | /* | ||
4427 | * With N contiguous data blocks, it need at most | ||
4428 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks | ||
4429 | * 2 dindirect blocks | ||
4430 | * 1 tindirect block | ||
4431 | */ | ||
4432 | indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
4433 | return indirects + 3; | ||
4434 | } | ||
4435 | /* | ||
4436 | * if nrblocks are not contiguous, worse case, each block touch | ||
4437 | * a indirect block, and each indirect block touch a double indirect | ||
4438 | * block, plus a triple indirect block | ||
4439 | */ | ||
4440 | indirects = nrblocks * 2 + 1; | ||
4441 | return indirects; | ||
4442 | } | ||
4443 | |||
4444 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | ||
4445 | { | ||
4446 | if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) | ||
4447 | return ext4_indirect_trans_blocks(inode, nrblocks, 0); | ||
4448 | return ext4_ext_index_trans_blocks(inode, nrblocks, 0); | ||
4449 | } | ||
4327 | /* | 4450 | /* |
4328 | * How many blocks doth make a writepage()? | 4451 | * Account for index blocks, block groups bitmaps and block group |
4329 | * | 4452 | * descriptor blocks if modify datablocks and index blocks |
4330 | * With N blocks per page, it may be: | 4453 | * worse case, the indexs blocks spread over different block groups |
4331 | * N data blocks | ||
4332 | * 2 indirect block | ||
4333 | * 2 dindirect | ||
4334 | * 1 tindirect | ||
4335 | * N+5 bitmap blocks (from the above) | ||
4336 | * N+5 group descriptor summary blocks | ||
4337 | * 1 inode block | ||
4338 | * 1 superblock. | ||
4339 | * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files | ||
4340 | * | 4454 | * |
4341 | * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS | 4455 | * If datablocks are discontiguous, they are possible to spread over |
4456 | * different block groups too. If they are contiugous, with flexbg, | ||
4457 | * they could still across block group boundary. | ||
4342 | * | 4458 | * |
4343 | * With ordered or writeback data it's the same, less the N data blocks. | 4459 | * Also account for superblock, inode, quota and xattr blocks |
4460 | */ | ||
4461 | int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | ||
4462 | { | ||
4463 | int groups, gdpblocks; | ||
4464 | int idxblocks; | ||
4465 | int ret = 0; | ||
4466 | |||
4467 | /* | ||
4468 | * How many index blocks need to touch to modify nrblocks? | ||
4469 | * The "Chunk" flag indicating whether the nrblocks is | ||
4470 | * physically contiguous on disk | ||
4471 | * | ||
4472 | * For Direct IO and fallocate, they calls get_block to allocate | ||
4473 | * one single extent at a time, so they could set the "Chunk" flag | ||
4474 | */ | ||
4475 | idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); | ||
4476 | |||
4477 | ret = idxblocks; | ||
4478 | |||
4479 | /* | ||
4480 | * Now let's see how many group bitmaps and group descriptors need | ||
4481 | * to account | ||
4482 | */ | ||
4483 | groups = idxblocks; | ||
4484 | if (chunk) | ||
4485 | groups += 1; | ||
4486 | else | ||
4487 | groups += nrblocks; | ||
4488 | |||
4489 | gdpblocks = groups; | ||
4490 | if (groups > EXT4_SB(inode->i_sb)->s_groups_count) | ||
4491 | groups = EXT4_SB(inode->i_sb)->s_groups_count; | ||
4492 | if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) | ||
4493 | gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; | ||
4494 | |||
4495 | /* bitmaps and block group descriptor blocks */ | ||
4496 | ret += groups + gdpblocks; | ||
4497 | |||
4498 | /* Blocks for super block, inode, quota and xattr blocks */ | ||
4499 | ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); | ||
4500 | |||
4501 | return ret; | ||
4502 | } | ||
4503 | |||
4504 | /* | ||
4505 | * Calulate the total number of credits to reserve to fit | ||
4506 | * the modification of a single pages into a single transaction, | ||
4507 | * which may include multiple chunks of block allocations. | ||
4344 | * | 4508 | * |
4345 | * If the inode's direct blocks can hold an integral number of pages then a | 4509 | * This could be called via ext4_write_begin() |
4346 | * page cannot straddle two indirect blocks, and we can only touch one indirect | ||
4347 | * and dindirect block, and the "5" above becomes "3". | ||
4348 | * | 4510 | * |
4349 | * This still overestimates under most circumstances. If we were to pass the | 4511 | * We need to consider the worse case, when |
4350 | * start and end offsets in here as well we could do block_to_path() on each | 4512 | * one new block per extent. |
4351 | * block and work out the exact number of indirects which are touched. Pah. | ||
4352 | */ | 4513 | */ |
4353 | |||
4354 | int ext4_writepage_trans_blocks(struct inode *inode) | 4514 | int ext4_writepage_trans_blocks(struct inode *inode) |
4355 | { | 4515 | { |
4356 | int bpp = ext4_journal_blocks_per_page(inode); | 4516 | int bpp = ext4_journal_blocks_per_page(inode); |
4357 | int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3; | ||
4358 | int ret; | 4517 | int ret; |
4359 | 4518 | ||
4360 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) | 4519 | ret = ext4_meta_trans_blocks(inode, bpp, 0); |
4361 | return ext4_ext_writepage_trans_blocks(inode, bpp); | ||
4362 | 4520 | ||
4521 | /* Account for data blocks for journalled mode */ | ||
4363 | if (ext4_should_journal_data(inode)) | 4522 | if (ext4_should_journal_data(inode)) |
4364 | ret = 3 * (bpp + indirects) + 2; | 4523 | ret += bpp; |
4365 | else | ||
4366 | ret = 2 * (bpp + indirects) + 2; | ||
4367 | |||
4368 | #ifdef CONFIG_QUOTA | ||
4369 | /* We know that structure was already allocated during DQUOT_INIT so | ||
4370 | * we will be updating only the data blocks + inodes */ | ||
4371 | ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); | ||
4372 | #endif | ||
4373 | |||
4374 | return ret; | 4524 | return ret; |
4375 | } | 4525 | } |
4376 | 4526 | ||
4377 | /* | 4527 | /* |
4528 | * Calculate the journal credits for a chunk of data modification. | ||
4529 | * | ||
4530 | * This is called from DIO, fallocate or whoever calling | ||
4531 | * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. | ||
4532 | * | ||
4533 | * journal buffers for data blocks are not included here, as DIO | ||
4534 | * and fallocate do no need to journal data buffers. | ||
4535 | */ | ||
4536 | int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) | ||
4537 | { | ||
4538 | return ext4_meta_trans_blocks(inode, nrblocks, 1); | ||
4539 | } | ||
4540 | |||
4541 | /* | ||
4378 | * The caller must have previously called ext4_reserve_inode_write(). | 4542 | * The caller must have previously called ext4_reserve_inode_write(). |
4379 | * Give this, we know that the caller already has write access to iloc->bh. | 4543 | * Give this, we know that the caller already has write access to iloc->bh. |
4380 | */ | 4544 | */ |