Diffstat (limited to 'fs/ext4/inode.c')

 fs/ext4/inode.c | 467 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 301 insertions(+), 166 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c7fed5b18745..a2e7952bc5f9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -371,6 +371,34 @@ static int ext4_block_to_path(struct inode *inode,
 	return n;
 }
 
+static int __ext4_check_blockref(const char *function, struct inode *inode,
+				 unsigned int *p, unsigned int max) {
+
+	unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
+	unsigned int *bref = p;
+	while (bref < p+max) {
+		if (unlikely(*bref >= maxblocks)) {
+			ext4_error(inode->i_sb, function,
+				   "block reference %u >= max (%u) "
+				   "in inode #%lu, offset=%d",
+				   *bref, maxblocks,
+				   inode->i_ino, (int)(bref-p));
+			return -EIO;
+		}
+		bref++;
+	}
+	return 0;
+}
+
+
+#define ext4_check_indirect_blockref(inode, bh)                         \
+	__ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data,   \
+			      EXT4_ADDR_PER_BLOCK((inode)->i_sb))
+
+#define ext4_check_inode_blockref(inode)                                 \
+	__ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data,    \
+			      EXT4_NDIR_BLOCKS)
+
 /**
  * ext4_get_branch - read the chain of indirect blocks leading to data
  * @inode: inode in question
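
[Commentary, not part of the patch: the new __ext4_check_blockref() rejects any on-disk block reference that points past the end of the filesystem, so a corrupted indirect block or inode cannot send the mapping code off to a bogus location. A minimal standalone sketch of the same bounds-check pattern (names here are illustrative; note the kernel helper compares the raw on-disk values, which are little-endian __le32, so a fully portable version would pass them through le32_to_cpu() first):

	#include <stdio.h>

	/* Reject any block reference >= the filesystem block count. */
	static int check_blockrefs(const unsigned int *refs, unsigned int n,
				   unsigned int maxblocks)
	{
		for (unsigned int i = 0; i < n; i++) {
			if (refs[i] >= maxblocks) {
				fprintf(stderr, "bad ref %u at offset %u\n",
					refs[i], i);
				return -1;	/* the kernel returns -EIO */
			}
		}
		return 0;
	}

	int main(void)
	{
		unsigned int refs[] = { 12, 7, 99999, 3 };

		/* fails: 99999 >= 1024 */
		return check_blockrefs(refs, 4, 1024) ? 1 : 0;
	}
]
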
@@ -415,9 +443,22 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 	if (!p->key)
 		goto no_block;
 	while (--depth) {
-		bh = sb_bread(sb, le32_to_cpu(p->key));
-		if (!bh)
+		bh = sb_getblk(sb, le32_to_cpu(p->key));
+		if (unlikely(!bh))
 			goto failure;
+
+		if (!bh_uptodate_or_lock(bh)) {
+			if (bh_submit_read(bh) < 0) {
+				put_bh(bh);
+				goto failure;
+			}
+			/* validate block references */
+			if (ext4_check_indirect_blockref(inode, bh)) {
+				put_bh(bh);
+				goto failure;
+			}
+		}
+
 		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
 		/* Reader: end */
 		if (!p->key)
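
[Commentary, not part of the patch: switching from sb_bread() to sb_getblk() plus bh_uptodate_or_lock()/bh_submit_read() means an indirect block is validated exactly once, when it is actually read from disk; a buffer already uptodate in the cache skips both the I/O and the re-check. A rough sketch of that read-and-validate-once idiom (all names here are stand-ins):

	#include <stdbool.h>
	#include <stdio.h>

	struct cached_block {
		bool uptodate;
		unsigned int ref;
	};

	static int read_from_disk(struct cached_block *b)
	{
		b->ref = 42;			/* stand-in for bh_submit_read() */
		return 0;
	}

	static int get_block_checked(struct cached_block *b)
	{
		if (!b->uptodate) {		/* bh_uptodate_or_lock() */
			if (read_from_disk(b))
				return -1;
			if (b->ref >= 100000)	/* validate on first read only */
				return -1;
			b->uptodate = true;
		}
		return 0;			/* cached: no I/O, no re-check */
	}

	int main(void)
	{
		struct cached_block b = { 0 };

		printf("first:  %d\n", get_block_checked(&b));
		printf("cached: %d\n", get_block_checked(&b));
		return 0;
	}
]
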
@@ -459,6 +500,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 	ext4_fsblk_t bg_start;
 	ext4_fsblk_t last_block;
 	ext4_grpblk_t colour;
+	ext4_group_t block_group;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
 
 	/* Try to find previous block */
 	for (p = ind->p - 1; p >= start; p--) {
@@ -474,9 +517,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 	 * It is going to be referred to from the inode itself? OK, just put it
 	 * into the same cylinder group then.
 	 */
-	bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
+	block_group = ei->i_block_group;
+	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+		block_group &= ~(flex_size-1);
+		if (S_ISREG(inode->i_mode))
+			block_group++;
+	}
+	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
 	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
+	/*
+	 * If we are doing delayed allocation, we don't need take
+	 * colour into account.
+	 */
+	if (test_opt(inode->i_sb, DELALLOC))
+		return bg_start;
+
 	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
 		colour = (current->pid % 16) *
 			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
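
[Commentary, not part of the patch: a worked example of the flex_bg rounding above. With flex_size = 16, an inode living in block group 37 has its allocation goal moved to the start of its flex group, 37 & ~15 = 32, and block_group++ then steers regular-file data to group 33; the assumption in this scheme is that the first group of a flex group is better left to directories and metadata:

	#include <stdio.h>

	int main(void)
	{
		unsigned int flex_size = 16, block_group = 37;
		int is_regular_file = 1;

		block_group &= ~(flex_size - 1);	/* 37 -> 32 */
		if (is_regular_file)
			block_group++;			/* 32 -> 33 */
		printf("allocation goal group: %u\n", block_group);
		return 0;
	}
]
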
@@ -975,6 +1031,17 @@ out:
 	return err;
 }
 
+qsize_t ext4_get_reserved_space(struct inode *inode)
+{
+	unsigned long long total;
+
+	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+	total = EXT4_I(inode)->i_reserved_data_blocks +
+		EXT4_I(inode)->i_reserved_meta_blocks;
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+	return total;
+}
 /*
  * Calculate the number of metadata blocks need to reserve
  * to allocate @blocks for non extent file based file
@@ -1036,8 +1103,21 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 	/* update per-inode reservations */
 	BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
 	EXT4_I(inode)->i_reserved_data_blocks -= used;
-
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+	/*
+	 * free those over-booking quota for metadata blocks
+	 */
+	if (mdb_free)
+		vfs_dq_release_reservation_block(inode, mdb_free);
+
+	/*
+	 * If we have done all the pending block allocations and if
+	 * there aren't any writers on the inode, we can discard the
+	 * inode's preallocations.
+	 */
+	if (!total && (atomic_read(&inode->i_writecount) == 0))
+		ext4_discard_preallocations(inode);
 }
 
 /*
@@ -1553,8 +1633,8 @@ static int ext4_journalled_write_end(struct file *file,
 static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 {
 	int retries = 0;
-	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	unsigned long md_needed, mdblocks, total = 0;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	unsigned long md_needed, mdblocks, total = 0;
 
 	/*
 	 * recalculate the amount of metadata blocks to reserve
@@ -1570,12 +1650,23 @@ repeat:
 	md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
 	total = md_needed + nrblocks;
 
+	/*
+	 * Make quota reservation here to prevent quota overflow
+	 * later. Real quota accounting is done at pages writeout
+	 * time.
+	 */
+	if (vfs_dq_reserve_block(inode, total)) {
+		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+		return -EDQUOT;
+	}
+
 	if (ext4_claim_free_blocks(sbi, total)) {
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
 			yield();
 			goto repeat;
 		}
+		vfs_dq_release_reservation_block(inode, total);
 		return -ENOSPC;
 	}
 	EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
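
[Commentary, not part of the patch: the ordering here matters: the quota reservation is taken first, and if the filesystem cannot supply the blocks (and the allocation will not be retried), the quota reservation is rolled back so the two counters never drift apart. A toy sketch of that reserve-then-claim-with-rollback pattern (illustrative names and counters):

	#include <stdio.h>

	static unsigned long quota_reserved;
	static unsigned long fs_free_blocks = 8;

	static int quota_reserve(unsigned long n)
	{
		quota_reserved += n;	/* could fail with -EDQUOT in real life */
		return 0;
	}

	static int claim_free_blocks(unsigned long n)
	{
		if (n > fs_free_blocks)
			return -1;			/* -ENOSPC */
		fs_free_blocks -= n;
		return 0;
	}

	static int reserve(unsigned long total)
	{
		if (quota_reserve(total))
			return -1;
		if (claim_free_blocks(total)) {
			quota_reserved -= total;	/* roll back */
			return -1;
		}
		return 0;
	}

	int main(void)
	{
		printf("reserve 4:  %d (quota %lu)\n", reserve(4), quota_reserved);
		printf("reserve 40: %d (quota %lu)\n", reserve(40), quota_reserved);
		return 0;
	}
]
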
@@ -1629,6 +1720,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
 	EXT4_I(inode)->i_reserved_meta_blocks = mdb;
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+	vfs_dq_release_reservation_block(inode, release);
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
@@ -1658,9 +1751,10 @@ static void ext4_da_page_release_reservation(struct page *page,
 
 struct mpage_da_data {
 	struct inode *inode;
-	struct buffer_head lbh;			/* extent of blocks */
+	sector_t b_blocknr;		/* start block number of extent */
+	size_t b_size;			/* size of extent */
+	unsigned long b_state;		/* state of the extent */
 	unsigned long first_page, next_page;	/* extent of pages */
-	get_block_t *get_block;
 	struct writeback_control *wbc;
 	int io_done;
 	int pages_written;
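
[Commentary, not part of the patch: replacing the embedded struct buffer_head with a bare (b_blocknr, b_size, b_state) triple shrinks mpage_da_data and drops the pretence that the accumulated extent is a real buffer. The cost is that the buffer_mapped()/buffer_delay() helpers no longer apply, so the code below tests the BH_* bits by hand. A small sketch of the equivalence (the BH_* values below are illustrative, not the kernel's):

	#include <stdio.h>

	enum { BH_Mapped = 5, BH_Delay = 9 };

	int main(void)
	{
		unsigned long b_state = (1UL << BH_Mapped) | (1UL << BH_Delay);

		int mapped = !!(b_state & (1 << BH_Mapped));
		int delay  = !!(b_state & (1 << BH_Delay));

		/* mapped && !delay would mean: already on disk, skip it */
		printf("mapped=%d delay=%d\n", mapped, delay);
		return 0;
	}
]
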
@@ -1674,7 +1768,6 @@ struct mpage_da_data {
  * @mpd->inode: inode
  * @mpd->first_page: first page of the extent
  * @mpd->next_page: page after the last page of the extent
- * @mpd->get_block: the filesystem's block mapper function
  *
  * By the time mpage_da_submit_io() is called we expect all blocks
  * to be allocated. this may be wrong if allocation failed.
@@ -1694,7 +1787,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 	/*
 	 * We need to start from the first_page to the next_page - 1
 	 * to make sure we also write the mapped dirty buffer_heads.
-	 * If we look at mpd->lbh.b_blocknr we would only be looking
+	 * If we look at mpd->b_blocknr we would only be looking
 	 * at the currently mapped buffer_heads.
 	 */
 	index = mpd->first_page;
@@ -1884,68 +1977,111 @@ static void ext4_print_free_blocks(struct inode *inode)
 	return;
 }
 
+#define EXT4_DELALLOC_RSVED 1
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	int ret;
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+	loff_t disksize = EXT4_I(inode)->i_disksize;
+	handle_t *handle = NULL;
+
+	handle = ext4_journal_current_handle();
+	BUG_ON(!handle);
+	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+				   bh_result, create, 0, EXT4_DELALLOC_RSVED);
+	if (ret <= 0)
+		return ret;
+
+	bh_result->b_size = (ret << inode->i_blkbits);
+
+	if (ext4_should_order_data(inode)) {
+		int retval;
+		retval = ext4_jbd2_file_inode(handle, inode);
+		if (retval)
+			/*
+			 * Failed to add inode for ordered mode. Don't
+			 * update file size
+			 */
+			return retval;
+	}
+
+	/*
+	 * Update on-disk size along with block allocation we don't
+	 * use 'extend_disksize' as size may change within already
+	 * allocated block -bzzz
+	 */
+	disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+	if (disksize > i_size_read(inode))
+		disksize = i_size_read(inode);
+	if (disksize > EXT4_I(inode)->i_disksize) {
+		ext4_update_i_disksize(inode, disksize);
+		ret = ext4_mark_inode_dirty(handle, inode);
+		return ret;
+	}
+	return 0;
+}
+
 /*
  * mpage_da_map_blocks - go through given space
  *
- * @mpd->lbh - bh describing space
- * @mpd->get_block - the filesystem's block mapper function
+ * @mpd - bh describing space
 *
 * The function skips space we know is already mapped to disk blocks.
 *
 */
 static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
 	int err = 0;
 	struct buffer_head new;
-	struct buffer_head *lbh = &mpd->lbh;
 	sector_t next;
 
 	/*
 	 * We consider only non-mapped and non-allocated blocks
 	 */
-	if (buffer_mapped(lbh) && !buffer_delay(lbh))
+	if ((mpd->b_state & (1 << BH_Mapped)) &&
+		!(mpd->b_state & (1 << BH_Delay)))
 		return 0;
-	new.b_state = lbh->b_state;
+	new.b_state = mpd->b_state;
 	new.b_blocknr = 0;
-	new.b_size = lbh->b_size;
-	next = lbh->b_blocknr;
+	new.b_size = mpd->b_size;
+	next = mpd->b_blocknr;
 	/*
 	 * If we didn't accumulate anything
 	 * to write simply return
 	 */
 	if (!new.b_size)
 		return 0;
-	err = mpd->get_block(mpd->inode, next, &new, 1);
-	if (err) {
 
-		/* If get block returns with error
-		 * we simply return. Later writepage
-		 * will redirty the page and writepages
-		 * will find the dirty page again
+	err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
+	if (err) {
+		/*
+		 * If get block returns with error we simply
+		 * return. Later writepage will redirty the page and
+		 * writepages will find the dirty page again
 		 */
 		if (err == -EAGAIN)
 			return 0;
 
 		if (err == -ENOSPC &&
 		    ext4_count_free_blocks(mpd->inode->i_sb)) {
 			mpd->retval = err;
 			return 0;
 		}
 
 		/*
-		 * get block failure will cause us
-		 * to loop in writepages. Because
-		 * a_ops->writepage won't be able to
-		 * make progress. The page will be redirtied
-		 * by writepage and writepages will again
-		 * try to write the same.
+		 * get block failure will cause us to loop in
+		 * writepages, because a_ops->writepage won't be able
+		 * to make progress. The page will be redirtied by
+		 * writepage and writepages will again try to write
+		 * the same.
 		 */
 		printk(KERN_EMERG "%s block allocation failed for inode %lu "
 			"at logical offset %llu with max blocks "
 			"%zd with error %d\n",
 			__func__, mpd->inode->i_ino,
 			(unsigned long long)next,
-			lbh->b_size >> mpd->inode->i_blkbits, err);
+			mpd->b_size >> mpd->inode->i_blkbits, err);
 		printk(KERN_EMERG "This should not happen.!! "
 			"Data will be lost\n");
 		if (err == -ENOSPC) {
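
[Commentary, not part of the patch: a worked example of the i_disksize update in the relocated ext4_da_get_block_write() above. After mapping ret blocks starting at logical block iblock, the byte offset of the end of the mapped range is (iblock + ret) << blkbits, clamped to i_size so the on-disk size never claims more than the file actually holds:

	#include <stdio.h>

	int main(void)
	{
		long long iblock = 10, ret = 4, blkbits = 12;	/* 4K blocks */
		long long i_size = 50000;

		long long disksize = (iblock + ret) << blkbits;	/* 57344 */
		if (disksize > i_size)
			disksize = i_size;			/* clamp: 50000 */
		printf("i_disksize candidate: %lld\n", disksize);
		return 0;
	}
]
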
@@ -1953,7 +2089,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	}
 	/* invlaidate all the pages */
 	ext4_da_block_invalidatepages(mpd, next,
-			lbh->b_size >> mpd->inode->i_blkbits);
+			mpd->b_size >> mpd->inode->i_blkbits);
 	return err;
 }
 	BUG_ON(new.b_size == 0);
@@ -1965,7 +2101,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 * If blocks are delayed marked, we need to
 	 * put actual blocknr and drop delayed bit
 	 */
-	if (buffer_delay(lbh) || buffer_unwritten(lbh))
+	if ((mpd->b_state & (1 << BH_Delay)) ||
+	    (mpd->b_state & (1 << BH_Unwritten)))
 		mpage_put_bnr_to_bhs(mpd, next, &new);
 
 	return 0;
@@ -1984,12 +2121,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
  * the function is used to collect contig. blocks in same state
  */
 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
-				   sector_t logical, struct buffer_head *bh)
+				   sector_t logical, size_t b_size,
+				   unsigned long b_state)
 {
 	sector_t next;
-	size_t b_size = bh->b_size;
-	struct buffer_head *lbh = &mpd->lbh;
-	int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
+	int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
 
 	/* check if thereserved journal credits might overflow */
 	if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
@@ -2016,19 +2152,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	/*
 	 * First block in the extent
 	 */
-	if (lbh->b_size == 0) {
-		lbh->b_blocknr = logical;
-		lbh->b_size = b_size;
-		lbh->b_state = bh->b_state & BH_FLAGS;
+	if (mpd->b_size == 0) {
+		mpd->b_blocknr = logical;
+		mpd->b_size = b_size;
+		mpd->b_state = b_state & BH_FLAGS;
 		return;
 	}
 
-	next = lbh->b_blocknr + nrblocks;
+	next = mpd->b_blocknr + nrblocks;
 	/*
 	 * Can we merge the block to our big extent?
 	 */
-	if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
-		lbh->b_size += b_size;
+	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
+		mpd->b_size += b_size;
 		return;
 	}
 
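
[Commentary, not part of the patch: the merge rule above is the heart of the extent collection: a new chunk is absorbed only if it begins exactly where the running extent ends and carries the same state bits; anything else forces the extent to be flushed. A compact sketch (illustrative types):

	#include <stdio.h>

	struct extent { unsigned long start, len, state; };

	static int try_merge(struct extent *e, unsigned long logical,
			     unsigned long len, unsigned long state)
	{
		if (e->len == 0) {		/* first block in the extent */
			e->start = logical;
			e->len = len;
			e->state = state;
			return 1;
		}
		if (logical == e->start + e->len && state == e->state) {
			e->len += len;		/* contiguous, same state */
			return 1;
		}
		return 0;			/* caller must flush the extent */
	}

	int main(void)
	{
		struct extent e = { 0, 0, 0 };

		printf("%d", try_merge(&e, 100, 1, 3));		/* 1: starts extent */
		printf("%d", try_merge(&e, 101, 1, 3));		/* 1: merges */
		printf("%d\n", try_merge(&e, 200, 1, 3));	/* 0: gap, no merge */
		return 0;
	}
]
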
@@ -2057,7 +2193,7 @@ static int __mpage_da_writepage(struct page *page,
 {
 	struct mpage_da_data *mpd = data;
 	struct inode *inode = mpd->inode;
-	struct buffer_head *bh, *head, fake;
+	struct buffer_head *bh, *head;
 	sector_t logical;
 
 	if (mpd->io_done) {
@@ -2099,9 +2235,9 @@ static int __mpage_da_writepage(struct page *page,
 		/*
 		 * ... and blocks
 		 */
-		mpd->lbh.b_size = 0;
-		mpd->lbh.b_state = 0;
-		mpd->lbh.b_blocknr = 0;
+		mpd->b_size = 0;
+		mpd->b_state = 0;
+		mpd->b_blocknr = 0;
 	}
 
 	mpd->next_page = page->index + 1;
@@ -2109,16 +2245,8 @@ static int __mpage_da_writepage(struct page *page,
 		(PAGE_CACHE_SHIFT - inode->i_blkbits);
 
 	if (!page_has_buffers(page)) {
-		/*
-		 * There is no attached buffer heads yet (mmap?)
-		 * we treat the page asfull of dirty blocks
-		 */
-		bh = &fake;
-		bh->b_size = PAGE_CACHE_SIZE;
-		bh->b_state = 0;
-		set_buffer_dirty(bh);
-		set_buffer_uptodate(bh);
-		mpage_add_bh_to_extent(mpd, logical, bh);
+		mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
+				       (1 << BH_Dirty) | (1 << BH_Uptodate));
 		if (mpd->io_done)
 			return MPAGE_DA_EXTENT_TAIL;
 	} else {
@@ -2136,8 +2264,10 @@ static int __mpage_da_writepage(struct page *page,
 			 * with the page in ext4_da_writepage
 			 */
 			if (buffer_dirty(bh) &&
 			    (!buffer_mapped(bh) || buffer_delay(bh))) {
-				mpage_add_bh_to_extent(mpd, logical, bh);
+				mpage_add_bh_to_extent(mpd, logical,
+						       bh->b_size,
+						       bh->b_state);
 				if (mpd->io_done)
 					return MPAGE_DA_EXTENT_TAIL;
 			} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
@@ -2149,9 +2279,8 @@ static int __mpage_da_writepage(struct page *page,
 				 * unmapped buffer_head later we need to
 				 * use the b_state flag of that buffer_head.
 				 */
-				if (mpd->lbh.b_size == 0)
-					mpd->lbh.b_state =
-						bh->b_state & BH_FLAGS;
+				if (mpd->b_size == 0)
+					mpd->b_state = bh->b_state & BH_FLAGS;
 			}
 			logical++;
 		} while ((bh = bh->b_this_page) != head);
@@ -2161,51 +2290,6 @@ static int __mpage_da_writepage(struct page *page,
 }
 
 /*
- * mpage_da_writepages - walk the list of dirty pages of the given
- * address space, allocates non-allocated blocks, maps newly-allocated
- * blocks to existing bhs and issue IO them
- *
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @get_block: the filesystem's block mapper function.
- *
- * This is a library function, which implements the writepages()
- * address_space_operation.
- */
-static int mpage_da_writepages(struct address_space *mapping,
-			       struct writeback_control *wbc,
-			       struct mpage_da_data *mpd)
-{
-	int ret;
-
-	if (!mpd->get_block)
-		return generic_writepages(mapping, wbc);
-
-	mpd->lbh.b_size = 0;
-	mpd->lbh.b_state = 0;
-	mpd->lbh.b_blocknr = 0;
-	mpd->first_page = 0;
-	mpd->next_page = 0;
-	mpd->io_done = 0;
-	mpd->pages_written = 0;
-	mpd->retval = 0;
-
-	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-	/*
-	 * Handle last extent of pages
-	 */
-	if (!mpd->io_done && mpd->next_page != mpd->first_page) {
-		if (mpage_da_map_blocks(mpd) == 0)
-			mpage_da_submit_io(mpd);
-
-		mpd->io_done = 1;
-		ret = MPAGE_DA_EXTENT_TAIL;
-	}
-	wbc->nr_to_write -= mpd->pages_written;
-	return ret;
-}
-
-/*
  * this is a special callback for ->write_begin() only
  * it's intention is to return mapped block or reserve space
  */
@@ -2244,51 +2328,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 
 	return ret;
 }
-#define EXT4_DELALLOC_RSVED 1
-static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
-				   struct buffer_head *bh_result, int create)
-{
-	int ret;
-	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-	loff_t disksize = EXT4_I(inode)->i_disksize;
-	handle_t *handle = NULL;
-
-	handle = ext4_journal_current_handle();
-	BUG_ON(!handle);
-	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-				   bh_result, create, 0, EXT4_DELALLOC_RSVED);
-	if (ret > 0) {
-
-		bh_result->b_size = (ret << inode->i_blkbits);
-
-		if (ext4_should_order_data(inode)) {
-			int retval;
-			retval = ext4_jbd2_file_inode(handle, inode);
-			if (retval)
-				/*
-				 * Failed to add inode for ordered
-				 * mode. Don't update file size
-				 */
-				return retval;
-		}
-
-		/*
-		 * Update on-disk size along with block allocation
-		 * we don't use 'extend_disksize' as size may change
-		 * within already allocated block -bzzz
-		 */
-		disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
-		if (disksize > i_size_read(inode))
-			disksize = i_size_read(inode);
-		if (disksize > EXT4_I(inode)->i_disksize) {
-			ext4_update_i_disksize(inode, disksize);
-			ret = ext4_mark_inode_dirty(handle, inode);
-			return ret;
-		}
-		ret = 0;
-	}
-	return ret;
-}
 
 static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
 {
@@ -2539,8 +2578,38 @@ retry:
 			dump_stack();
 			goto out_writepages;
 		}
-		mpd.get_block = ext4_da_get_block_write;
-		ret = mpage_da_writepages(mapping, wbc, &mpd);
+
+		/*
+		 * Now call __mpage_da_writepage to find the next
+		 * contiguous region of logical blocks that need
+		 * blocks to be allocated by ext4. We don't actually
+		 * submit the blocks for I/O here, even though
+		 * write_cache_pages thinks it will, and will set the
+		 * pages as clean for write before calling
+		 * __mpage_da_writepage().
+		 */
+		mpd.b_size = 0;
+		mpd.b_state = 0;
+		mpd.b_blocknr = 0;
+		mpd.first_page = 0;
+		mpd.next_page = 0;
+		mpd.io_done = 0;
+		mpd.pages_written = 0;
+		mpd.retval = 0;
+		ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
+					&mpd);
+		/*
+		 * If we have a contigous extent of pages and we
+		 * haven't done the I/O yet, map the blocks and submit
+		 * them for I/O.
+		 */
+		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
+			if (mpage_da_map_blocks(&mpd) == 0)
+				mpage_da_submit_io(&mpd);
+			mpd.io_done = 1;
+			ret = MPAGE_DA_EXTENT_TAIL;
+		}
+		wbc->nr_to_write -= mpd.pages_written;
 
 		ext4_journal_stop(handle);
 
@@ -2816,6 +2885,48 @@ out:
 	return;
 }
 
+/*
+ * Force all delayed allocation blocks to be allocated for a given inode.
+ */
+int ext4_alloc_da_blocks(struct inode *inode)
+{
+	if (!EXT4_I(inode)->i_reserved_data_blocks &&
+	    !EXT4_I(inode)->i_reserved_meta_blocks)
+		return 0;
+
+	/*
+	 * We do something simple for now. The filemap_flush() will
+	 * also start triggering a write of the data blocks, which is
+	 * not strictly speaking necessary (and for users of
+	 * laptop_mode, not even desirable). However, to do otherwise
+	 * would require replicating code paths in:
+	 *
+	 * ext4_da_writepages() ->
+	 *    write_cache_pages() ---> (via passed in callback function)
+	 *       __mpage_da_writepage() -->
+	 *          mpage_add_bh_to_extent()
+	 *          mpage_da_map_blocks()
+	 *
+	 * The problem is that write_cache_pages(), located in
+	 * mm/page-writeback.c, marks pages clean in preparation for
+	 * doing I/O, which is not desirable if we're not planning on
+	 * doing I/O at all.
+	 *
+	 * We could call write_cache_pages(), and then redirty all of
+	 * the pages by calling redirty_page_for_writeback() but that
+	 * would be ugly in the extreme. So instead we would need to
+	 * replicate parts of the code in the above functions,
+	 * simplifying them becuase we wouldn't actually intend to
+	 * write out the pages, but rather only collect contiguous
+	 * logical block extents, call the multi-block allocator, and
+	 * then update the buffer heads with the block allocations.
+	 *
+	 * For now, though, we'll cheat by calling filemap_flush(),
+	 * which will map the blocks, and start the I/O, but not
+	 * actually wait for the I/O to complete.
+	 */
+	return filemap_flush(inode->i_mapping);
+}
 
 /*
  * bmap() is special. It gets used by applications such as lilo and by
@@ -3838,6 +3949,9 @@ void ext4_truncate(struct inode *inode)
 	if (!ext4_can_truncate(inode))
 		return;
 
+	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+		ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
+
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
 		ext4_ext_truncate(inode);
 		return;
@@ -4080,12 +4194,7 @@ make_io:
 			unsigned num;
 
 			table = ext4_inode_table(sb, gdp);
-			/* Make sure s_inode_readahead_blks is a power of 2 */
-			while (EXT4_SB(sb)->s_inode_readahead_blks &
-			       (EXT4_SB(sb)->s_inode_readahead_blks-1))
-				EXT4_SB(sb)->s_inode_readahead_blks =
-					(EXT4_SB(sb)->s_inode_readahead_blks &
-					 (EXT4_SB(sb)->s_inode_readahead_blks-1));
+			/* s_inode_readahead_blks is always a power of 2 */
 			b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
 			if (table > b)
 				b = table;
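
[Commentary, not part of the patch: the deleted loop used to whittle an arbitrary s_inode_readahead_blks down to a power of two on every inode read; per the new comment, the invariant is now guaranteed wherever the value is set, so the hot path is left with a single mask. The mask works like this:

	#include <stdio.h>

	int main(void)
	{
		unsigned long n = 32;		/* power-of-2 readahead window */
		unsigned long block = 1234;

		/* rounds down to an n-block boundary: 1234 -> 1216 */
		printf("%lu -> %lu\n", block, block & ~(n - 1));
		return 0;
	}
]
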
@@ -4257,6 +4366,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	ei->i_disksize = inode->i_size;
 	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
 	ei->i_block_group = iloc.block_group;
+	ei->i_last_alloc_group = ~0;
 	/*
 	 * NOTE! The in-memory inode i_data array is in little-endian order
 	 * even on big-endian machines: we do NOT byteswap the block numbers!
@@ -4299,6 +4409,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
 	}
 
+	if (ei->i_flags & EXT4_EXTENTS_FL) {
+		/* Validate extent which is part of inode */
+		ret = ext4_ext_check_inode(inode);
+	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		   (S_ISLNK(inode->i_mode) &&
+		    !ext4_inode_is_fast_symlink(inode))) {
+		/* Validate block references which are part of inode */
+		ret = ext4_check_inode_blockref(inode);
+	}
+	if (ret) {
+		brelse(bh);
+		goto bad_inode;
+	}
+
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &ext4_file_inode_operations;
 		inode->i_fop = &ext4_file_operations;
@@ -4315,7 +4439,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			inode->i_op = &ext4_symlink_inode_operations;
 			ext4_set_aops(inode);
 		}
-	} else {
+	} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+		   S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
 		inode->i_op = &ext4_special_inode_operations;
 		if (raw_inode->i_block[0])
 			init_special_inode(inode, inode->i_mode,
@@ -4323,6 +4448,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		else
 			init_special_inode(inode, inode->i_mode,
 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+	} else {
+		brelse(bh);
+		ret = -EIO;
+		ext4_error(inode->i_sb, __func__,
+			   "bogus i_mode (%o) for inode=%lu",
+			   inode->i_mode, inode->i_ino);
+		goto bad_inode;
 	}
 	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
@@ -4612,7 +4744,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
-		error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+		error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
 		if (error) {
 			ext4_journal_stop(handle);
 			return error;
@@ -4991,7 +5123,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
  * i_size has been changed by generic_commit_write() and we thus need
  * to include the updated inode in the current transaction.
  *
- * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+ * Also, vfs_dq_alloc_block() will always dirty the inode when blocks
  * are allocated to the file.
  *
  * If the inode is marked synchronous, we don't honour that here - doing
@@ -5116,8 +5248,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
 	return !buffer_mapped(bh);
 }
 
-int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	loff_t size;
 	unsigned long len;
 	int ret = -EINVAL;
@@ -5169,6 +5302,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 		goto out_unlock;
 	ret = 0;
 out_unlock:
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
 	up_read(&inode->i_alloc_sem);
 	return ret;
 }
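
[Commentary, not part of the patch: with the switch to the vm_fault-based signature, ->page_mkwrite() is expected to return VM_FAULT_* codes rather than -errno values, which is why every error path above is collapsed to VM_FAULT_SIGBUS before returning. A trivial sketch of that translation (constant value illustrative):

	#include <stdio.h>

	#define VM_FAULT_SIGBUS 0x0002

	static int page_mkwrite_result(int err)
	{
		return err ? VM_FAULT_SIGBUS : 0;
	}

	int main(void)
	{
		printf("ok: %d, error: %d\n",
		       page_mkwrite_result(0), page_mkwrite_result(-22));
		return 0;
	}
]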