Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/balloc.c          209
-rw-r--r--  fs/ext4/dir.c              17
-rw-r--r--  fs/ext4/ext4.h             61
-rw-r--r--  fs/ext4/ext4_extents.h      1
-rw-r--r--  fs/ext4/ext4_i.h           10
-rw-r--r--  fs/ext4/ext4_jbd2.h        21
-rw-r--r--  fs/ext4/ext4_sb.h           5
-rw-r--r--  fs/ext4/extents.c         111
-rw-r--r--  fs/ext4/file.c             20
-rw-r--r--  fs/ext4/fsync.c             4
-rw-r--r--  fs/ext4/group.h             2
-rw-r--r--  fs/ext4/ialloc.c          113
-rw-r--r--  fs/ext4/inode.c          1591
-rw-r--r--  fs/ext4/mballoc.c         451
-rw-r--r--  fs/ext4/namei.c            45
-rw-r--r--  fs/ext4/resize.c           52
-rw-r--r--  fs/ext4/super.c           146
-rw-r--r--  fs/ext4/xattr.c             2
-rw-r--r--  fs/ext4/xattr_trusted.c     4
-rw-r--r--  fs/ext4/xattr_user.c        4
20 files changed, 2321 insertions(+), 548 deletions(-)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9cc80b9cc8d8..495ab21b9832 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
 			ext4_group_t block_group)
 {
 	ext4_group_t actual_group;
-	ext4_get_group_no_and_offset(sb, block, &actual_group, 0);
+	ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
 	if (actual_group == block_group)
 		return 1;
 	return 0;
@@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 				le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
 		}
 	} else { /* For META_BG_BLOCK_GROUPS */
-		int group_rel = (block_group -
-				 le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
-				EXT4_DESC_PER_BLOCK(sb);
-		if (group_rel == 0 || group_rel == 1 ||
-		    (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
-			bit_max += 1;
+		bit_max += ext4_bg_num_gdb(sb, block_group);
 	}
 
 	if (block_group == sbi->s_groups_count - 1) {
@@ -295,7 +290,7 @@ err_out:
 	return 0;
 }
 /**
- * read_block_bitmap()
+ * ext4_read_block_bitmap()
  * @sb: super block
  * @block_group: given block group
  *
@@ -305,7 +300,7 @@ err_out:
  * Return buffer_head on success or NULL in case of failure.
  */
 struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
 	struct ext4_group_desc * desc;
 	struct buffer_head * bh = NULL;
@@ -409,8 +404,7 @@ restart:
 		prev = rsv;
 	}
 	printk("Window map complete.\n");
-	if (bad)
-		BUG();
+	BUG_ON(bad);
 }
 #define rsv_window_dump(root, verbose) \
 	__rsv_window_dump((root), (verbose), __func__)
@@ -694,7 +688,7 @@ do_more:
 		count -= overflow;
 	}
 	brelse(bitmap_bh);
-	bitmap_bh = read_block_bitmap(sb, block_group);
+	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
 	if (!bitmap_bh)
 		goto error_return;
 	desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -810,6 +804,13 @@ do_more:
 	spin_unlock(sb_bgl_lock(sbi, block_group));
 	percpu_counter_add(&sbi->s_freeblocks_counter, count);
 
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+		spin_lock(sb_bgl_lock(sbi, flex_group));
+		sbi->s_flex_groups[flex_group].free_blocks += count;
+		spin_unlock(sb_bgl_lock(sbi, flex_group));
+	}
+
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
 	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1598,23 +1599,35 @@ out:
 
 /**
  * ext4_has_free_blocks()
  * @sbi: in-core super block structure.
+ * @nblocks: number of needed blocks
  *
- * Check if filesystem has at least 1 free block available for allocation.
+ * Check if filesystem has free blocks available for allocation.
+ * Return the number of blocks available for allocation for this request
+ * On success, return nblocks
  */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+						ext4_fsblk_t nblocks)
 {
-	ext4_fsblk_t free_blocks, root_blocks;
+	ext4_fsblk_t free_blocks;
+	ext4_fsblk_t root_blocks = 0;
 
 	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
-	root_blocks = ext4_r_blocks_count(sbi->s_es);
-	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+
+	if (!capable(CAP_SYS_RESOURCE) &&
 	    sbi->s_resuid != current->fsuid &&
-	    (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
-		return 0;
-	}
-	return 1;
-}
+	    (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+		root_blocks = ext4_r_blocks_count(sbi->s_es);
+#ifdef CONFIG_SMP
+	if (free_blocks - root_blocks < FBC_BATCH)
+		free_blocks =
+			percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
+#endif
+	if (free_blocks - root_blocks < nblocks)
+		return free_blocks - root_blocks;
+	return nblocks;
+}
+
 
 /**
  * ext4_should_retry_alloc()
@@ -1630,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
 		return 0;
 
 	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1639,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 }
 
 /**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
+ *
  * @handle: handle to this transaction
  * @inode: file inode
  * @goal: given target block(filesystem wide)
  * @count: target number of blocks to allocate
  * @errp: error code
  *
- * ext4_new_blocks uses a goal block to assist allocation. It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
+ * ext4_old_new_blocks uses a goal block to assist allocation and look up
+ * the block bitmap directly to do block allocation. It tries to
+ * allocate block(s) from the block group contains the goal block first. If
+ * that fails, it will try to allocate block(s) from other block groups
+ * without any specific goal block.
+ *
+ * This function is called when -o nomballoc mount option is enabled
  *
  */
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
 	struct buffer_head *bitmap_bh = NULL;
@@ -1676,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
 	ext4_group_t ngroups;
 	unsigned long num = *count;
 
-	*errp = -ENOSPC;
 	sb = inode->i_sb;
 	if (!sb) {
+		*errp = -ENODEV;
 		printk("ext4_new_block: nonexistent device");
 		return 0;
 	}
 
+	sbi = EXT4_SB(sb);
+	if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
+		/*
+		 * With delalloc we already reserved the blocks
+		 */
+		*count = ext4_has_free_blocks(sbi, *count);
+	}
+	if (*count == 0) {
+		*errp = -ENOSPC;
+		return 0;	/*return with ENOSPC error */
+	}
+	num = *count;
+
 	/*
 	 * Check quota for allocation of this block.
 	 */
@@ -1706,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
 	if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
 		my_rsv = &block_i->rsv_window_node;
 
-	if (!ext4_has_free_blocks(sbi)) {
-		*errp = -ENOSPC;
-		goto out;
-	}
-
 	/*
 	 * First, test whether the goal block is free.
 	 */
@@ -1734,7 +1759,7 @@ retry_alloc:
 		my_rsv = NULL;
 
 	if (free_blocks > 0) {
-		bitmap_bh = read_block_bitmap(sb, group_no);
+		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
 		grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1770,7 +1795,7 @@ retry_alloc:
 			continue;
 
 		brelse(bitmap_bh);
-		bitmap_bh = read_block_bitmap(sb, group_no);
+		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
 		/*
@@ -1882,7 +1907,15 @@ allocated:
 	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
-	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+	if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+		percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+		spin_lock(sb_bgl_lock(sbi, flex_group));
+		sbi->s_flex_groups[flex_group].free_blocks -= num;
+		spin_unlock(sb_bgl_lock(sbi, flex_group));
+	}
 
 	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
 	err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1915,46 +1948,104 @@ out:
 	return 0;
 }
 
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, int *errp)
+#define EXT4_META_BLOCK 0x1
+
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
+				ext4_lblk_t iblock, ext4_fsblk_t goal,
+				unsigned long *count, int *errp, int flags)
 {
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
 
 	if (!test_opt(inode->i_sb, MBALLOC)) {
-		unsigned long count = 1;
-		ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
-		return ret;
+		return ext4_old_new_blocks(handle, inode, goal, count, errp);
 	}
 
 	memset(&ar, 0, sizeof(ar));
+	/* Fill with neighbour allocated blocks */
+
 	ar.inode = inode;
 	ar.goal = goal;
-	ar.len = 1;
+	ar.len = *count;
+	ar.logical = iblock;
+
+	if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+		/* enable in-core preallocation for data block allocation */
+		ar.flags = EXT4_MB_HINT_DATA;
+	else
+		/* disable in-core preallocation for non-regular files */
+		ar.flags = 0;
+
 	ret = ext4_mb_new_blocks(handle, &ar, errp);
+	*count = ar.len;
 	return ret;
 }
 
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+/*
+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks need
+ * @errp: error code
+ *
+ * Return 1st allocated block number on success, *count stores total account
+ * error stores in errp pointer
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 		ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
-	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
-
-	if (!test_opt(inode->i_sb, MBALLOC)) {
-		ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
-		return ret;
+	ret = do_blk_alloc(handle, inode, 0, goal,
+				count, errp, EXT4_META_BLOCK);
+	/*
+	 * Account for the allocated meta blocks
+	 */
+	if (!(*errp)) {
+		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+		EXT4_I(inode)->i_allocated_meta_blocks += *count;
+		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	}
-
-	memset(&ar, 0, sizeof(ar));
-	ar.inode = inode;
-	ar.goal = goal;
-	ar.len = *count;
-	ret = ext4_mb_new_blocks(handle, &ar, errp);
-	*count = ar.len;
 	return ret;
 }
 
+/*
+ * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @errp: error code
+ *
+ * Return allocated block number on success
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+		ext4_fsblk_t goal, int *errp)
+{
+	unsigned long count = 1;
+	return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
+}
+
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks need
+ * @errp: error code
+ *
+ * Return 1st allocated block number on success, *count stores total account
+ * error stores in errp pointer
+ */
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+				ext4_lblk_t iblock, ext4_fsblk_t goal,
+				unsigned long *count, int *errp)
+{
+	return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}
 
 /**
  * ext4_count_free_blocks() -- count filesystem free blocks
@@ -1986,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 			continue;
 		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
 		brelse(bitmap_bh);
-		bitmap_bh = read_block_bitmap(sb, i);
+		bitmap_bh = ext4_read_block_bitmap(sb, i);
 		if (bitmap_bh == NULL)
 			continue;
 
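Aside: the flex_bg accounting introduced in balloc.c above follows one pattern that recurs in ialloc.c further down. A minimal sketch of that pattern, assuming the helper name flex_account_blocks() (illustrative only; the patch open-codes the body at each call site):

	/* Sketch: mirror a per-group free-block delta into its flex group.
	 * All names besides flex_account_blocks() appear in this series. */
	static void flex_account_blocks(struct ext4_sb_info *sbi,
					ext4_group_t block_group, int count)
	{
		if (sbi->s_log_groups_per_flex) {
			ext4_group_t flex_group = ext4_flex_group(sbi, block_group);

			spin_lock(sb_bgl_lock(sbi, flex_group));
			sbi->s_flex_groups[flex_group].free_blocks += count;
			spin_unlock(sb_bgl_lock(sbi, flex_group));
		}
	}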
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2bf0331ea194..d3d23d73c08b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
 		struct buffer_head *bh = NULL;
 
 		map_bh.b_state = 0;
-		err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+		err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+						0, 0, 0);
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
 
 	while (n) {
 		/* Do the node's children first */
-		if ((n)->rb_left) {
+		if (n->rb_left) {
 			n = n->rb_left;
 			continue;
 		}
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
 			parent->rb_right = NULL;
 		n = parent;
 	}
-	root->rb_node = NULL;
 }
 
 
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
 {
 	struct dir_private_info *p;
 
-	p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
 	if (!p)
 		return NULL;
-	p->root.rb_node = NULL;
-	p->curr_node = NULL;
-	p->extra_fname = NULL;
-	p->last_pos = 0;
 	p->curr_hash = pos2maj_hash(pos);
 	p->curr_minor_hash = pos2min_hash(pos);
-	p->next_hash = 0;
 	return p;
 }
 
@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
 	int ret;
 
 	if (!info) {
-		info = create_dir_info(filp->f_pos);
+		info = ext4_htree_create_dir_info(filp->f_pos);
 		if (!info)
 			return -ENOMEM;
 		filp->private_data = info;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8158083f7ac0..303e41cf7b14 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -22,7 +22,7 @@
 #include "ext4_i.h"
 
 /*
- * The second extended filesystem constants/structures
+ * The fourth extended filesystem constants/structures
  */
 
 /*
@@ -45,7 +45,7 @@
 #define ext4_debug(f, a...) \
 	do { \
 		printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
-			__FILE__, __LINE__, __FUNCTION__); \
+			__FILE__, __LINE__, __func__); \
 		printk (KERN_DEBUG f, ## a); \
 	} while (0)
 #else
@@ -74,6 +74,9 @@
 #define EXT4_MB_HINT_GOAL_ONLY		256
 /* goal is meaningful */
 #define EXT4_MB_HINT_TRY_GOAL		512
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED	1024
+
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -170,6 +173,15 @@ struct ext4_group_desc
 	__u32	bg_reserved2[3];
 };
 
+/*
+ * Structure of a flex block group info
+ */
+
+struct flex_groups {
+	__u32 free_inodes;
+	__u32 free_blocks;
+};
+
 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
 #define EXT4_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
 #define EXT4_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
@@ -527,6 +539,7 @@ do { \
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
 #define EXT4_MOUNT_MBALLOC		0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
@@ -647,7 +660,10 @@ struct ext4_super_block {
 	__le16	s_mmp_interval;		/* # seconds to wait in MMP checking */
 	__le64	s_mmp_block;		/* Block for multi-mount protection */
 	__le32	s_raid_stripe_width;	/* blocks on all data disks (N*stride)*/
-	__u32	s_reserved[163];	/* Padding to the end of the block */
+	__u8	s_log_groups_per_flex;	/* FLEX_BG group size */
+	__u8	s_reserved_char_pad2;
+	__le16	s_reserved_pad;
+	__u32	s_reserved[162];	/* Padding to the end of the block */
 };
 
 #ifdef __KERNEL__
@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
 extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 			ext4_group_t group);
-extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, int *errp);
-extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+			ext4_lblk_t iblock, ext4_fsblk_t goal,
+			unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+						ext4_fsblk_t nblocks);
 extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
 			ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
@@ -1016,9 +1037,14 @@ extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
 		unsigned long, unsigned long, int, unsigned long *);
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+			ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
+						ext4_grpblk_t add);
 
 
 /* inode.c */
+void ext4_da_release_space(struct inode *inode, int used, int to_free);
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
 		struct buffer_head *bh, ext4_fsblk_t blocknr);
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
@@ -1033,19 +1059,23 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
 extern int  ext4_write_inode (struct inode *, int);
 extern int  ext4_setattr (struct dentry *, struct iattr *);
+extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				struct kstat *stat);
 extern void ext4_delete_inode (struct inode *);
 extern int  ext4_sync_inode (handle_t *, struct inode *);
 extern void ext4_discard_reservation (struct inode *);
 extern void ext4_dirty_inode(struct inode *);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate (struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
+extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1159,10 +1189,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 }
 
 
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+					     ext4_group_t block_group)
+{
+	return block_group >> sbi->s_log_groups_per_flex;
+}
+
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+	return 1 << sbi->s_log_groups_per_flex;
+}
+
 #define ext4_std_error(sb, errno)				\
 do {								\
 	if ((errno))						\
-		__ext4_std_error((sb), __FUNCTION__, (errno));	\
+		__ext4_std_error((sb), __func__, (errno));	\
 } while (0)
 
 /*
@@ -1191,7 +1232,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			ext4_lblk_t iblock,
 			unsigned long max_blocks, struct buffer_head *bh_result,
 			int create, int extend_disksize);
-extern void ext4_ext_truncate(struct inode *, struct page *);
+extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
@@ -1199,7 +1240,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
 			sector_t block, unsigned long max_blocks,
 			struct buffer_head *bh, int create,
-			int extend_disksize);
+			int extend_disksize, int flag);
 #endif /* __KERNEL__ */
 
 #endif /* _EXT4_H */
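Aside: the ext4_flex_group() and ext4_flex_bg_size() inlines added above are plain shift arithmetic. A short usage sketch, assuming s_log_groups_per_flex has already been read from the superblock (super.c does this later in the same series):

	/* With s_log_groups_per_flex == 4, block groups 0..15 fall in flex
	 * group 0, groups 16..31 in flex group 1, and so on. */
	ext4_group_t flex = ext4_flex_group(sbi, block_group);	/* group >> 4 */
	unsigned int size = ext4_flex_bg_size(sbi);		/* 1 << 4 == 16 */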
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b595fab..6c166c0a54b7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
 		(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
 }
 
+extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
 extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 26a4ae255d79..ef7409f0e7e4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -79,7 +79,7 @@ struct ext4_ext_cache {
 };
 
 /*
- * third extended file system inode data in memory
+ * fourth extended file system inode data in memory
  */
 struct ext4_inode_info {
 	__le32	i_data[15];	/* unconverted */
@@ -150,6 +150,7 @@ struct ext4_inode_info {
 	 */
 	struct rw_semaphore i_data_sem;
 	struct inode vfs_inode;
+	struct jbd2_inode jinode;
 
 	unsigned long i_ext_generation;
 	struct ext4_ext_cache i_cached_extent;
@@ -162,6 +163,13 @@ struct ext4_inode_info {
 	/* mballoc */
 	struct list_head i_prealloc_list;
 	spinlock_t i_prealloc_lock;
+
+	/* allocation reservation info for delalloc */
+	unsigned long i_reserved_data_blocks;
+	unsigned long i_reserved_meta_blocks;
+	unsigned long i_allocated_meta_blocks;
+	unsigned short i_delalloc_reserved_flag;
+	spinlock_t i_block_reservation_lock;
 };
 
 #endif	/* _EXT4_I */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9255a7d28b24..eb8bc3afe6e9 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -142,19 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where,
 				handle_t *handle, struct buffer_head *bh);
 
 #define ext4_journal_get_undo_access(handle, bh) \
-	__ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+	__ext4_journal_get_undo_access(__func__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
-	__ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
+	__ext4_journal_get_write_access(__func__, (handle), (bh))
 #define ext4_journal_revoke(handle, blocknr, bh) \
-	__ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+	__ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
 #define ext4_journal_get_create_access(handle, bh) \
-	__ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
+	__ext4_journal_get_create_access(__func__, (handle), (bh))
 #define ext4_journal_dirty_metadata(handle, bh) \
-	__ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
+	__ext4_journal_dirty_metadata(__func__, (handle), (bh))
 #define ext4_journal_forget(handle, bh) \
-	__ext4_journal_forget(__FUNCTION__, (handle), (bh))
-
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
+	__ext4_journal_forget(__func__, (handle), (bh))
 
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
 int __ext4_journal_stop(const char *where, handle_t *handle);
@@ -165,7 +163,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
 }
 
 #define ext4_journal_stop(handle) \
-	__ext4_journal_stop(__FUNCTION__, (handle))
+	__ext4_journal_stop(__func__, (handle))
 
 static inline handle_t *ext4_journal_current_handle(void)
 {
@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
 	return jbd2_journal_force_commit(journal);
 }
 
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+	return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+}
+
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
 
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 5802e69f2191..6300226d5531 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -25,7 +25,7 @@
 #include <linux/rbtree.h>
 
 /*
- * third extended-fs super-block data in memory
+ * fourth extended-fs super-block data in memory
  */
 struct ext4_sb_info {
 	unsigned long s_desc_size;	/* Size of a group descriptor in bytes */
@@ -143,6 +143,9 @@ struct ext4_sb_info {
 
 	/* locality groups */
 	struct ext4_locality_group *s_locality_groups;
+
+	unsigned int s_log_groups_per_flex;
+	struct flex_groups *s_flex_groups;
 };
 
 #endif	/* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 47929c4e3dae..42c4c0c892ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
 	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
 }
 
-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_journal_restart(handle_t *handle, int needed)
 {
 	int err;
 
 	if (handle->h_buffer_credits > needed)
-		return handle;
-	if (!ext4_journal_extend(handle, needed))
-		return handle;
-	err = ext4_journal_restart(handle, needed);
-
-	return handle;
+		return 0;
+	err = ext4_journal_extend(handle, needed);
+	if (err)
+		return err;
+	return ext4_journal_restart(handle, needed);
 }
 
 /*
@@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	return bg_start + colour + block;
 }
 
+/*
+ * Allocation for a meta data block
+ */
 static ext4_fsblk_t
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 			struct ext4_ext_path *path,
 			struct ext4_extent *ex, int *err)
 {
 	ext4_fsblk_t goal, newblock;
 
 	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-	newblock = ext4_new_block(handle, inode, goal, err);
+	newblock = ext4_new_meta_block(handle, inode, goal, err);
 	return newblock;
 }
 
@@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
 	return size;
 }
 
+/*
+ * Calculate the number of metadata blocks needed
+ * to allocate @blocks
+ * Worse case is one block per extent
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+	int lcap, icap, rcap, leafs, idxs, num;
+	int newextents = blocks;
+
+	rcap = ext4_ext_space_root_idx(inode);
+	lcap = ext4_ext_space_block(inode);
+	icap = ext4_ext_space_block_idx(inode);
+
+	/* number of new leaf blocks needed */
+	num = leafs = (newextents + lcap - 1) / lcap;
+
+	/*
+	 * Worse case, we need separate index block(s)
+	 * to link all new leaf blocks
+	 */
+	idxs = (leafs + icap - 1) / icap;
+	do {
+		num += idxs;
+		idxs = (idxs + icap - 1) / icap;
+	} while (idxs > rcap);
+
+	return num;
+}
+
 static int
 ext4_ext_max_entries(struct inode *inode, int depth)
 {
@@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		alloc = 1;
 	}
 	path[0].p_hdr = eh;
+	path[0].p_bh = NULL;
 
 	i = depth;
 	/* walk through the tree */
@@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		}
 
 		path[ppos].p_depth = i;
-		path[ppos].p_hdr = eh;
 		path[ppos].p_ext = NULL;
 		path[ppos].p_idx = NULL;
 
 		/* find extent */
 		ext4_ext_binsearch(inode, path + ppos, block);
+		/* if not an empty leaf */
+		if (path[ppos].p_ext)
+			path[ppos].p_block = ext_pblock(path[ppos].p_ext);
 
 		ext4_ext_show_path(inode, path);
 
@@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	/* allocate all needed blocks */
 	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
 	for (a = 0; a < depth - at; a++) {
-		newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+		newblock = ext4_ext_new_meta_block(handle, inode, path,
+						   newext, &err);
 		if (newblock == 0)
 			goto cleanup;
 		ablocks[a] = newblock;
@@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t newblock;
 	int err = 0;
 
-	newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
 	if (newblock == 0)
 		return err;
 
@@ -981,6 +1017,8 @@ repeat:
 		/* if we found index with free entry, then use that
 		 * entry: create all needed subtree and add new leaf */
 		err = ext4_ext_split(handle, inode, path, newext, i);
+		if (err)
+			goto out;
 
 		/* refill path */
 		ext4_ext_drop_refs(path);
@@ -1883,11 +1921,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
 #endif
 
-		handle = ext4_ext_journal_restart(handle, credits);
-		if (IS_ERR(handle)) {
-			err = PTR_ERR(handle);
+		err = ext4_ext_journal_restart(handle, credits);
+		if (err)
 			goto out;
-		}
 
 		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
@@ -2529,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	int err = 0, depth, ret;
 	unsigned long allocated = 0;
 	struct ext4_allocation_request ar;
+	loff_t disksize;
 
 	__clear_bit(BH_New, &bh_result->b_state);
 	ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2616,8 +2653,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		 */
 		if (allocated > max_blocks)
 			allocated = max_blocks;
-		/* mark the buffer unwritten */
-		__set_bit(BH_Unwritten, &bh_result->b_state);
+		set_buffer_unwritten(bh_result);
 		goto out2;
 	}
 
@@ -2716,14 +2752,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		goto out2;
 	}
 
-	if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
-		EXT4_I(inode)->i_disksize = inode->i_size;
-
 	/* previous routine could use block we allocated */
 	newblock = ext_pblock(&newex);
 	allocated = ext4_ext_get_actual_len(&newex);
 outnew:
-	__set_bit(BH_New, &bh_result->b_state);
+	if (extend_disksize) {
+		disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
+		if (disksize > i_size_read(inode))
+			disksize = i_size_read(inode);
+		if (disksize > EXT4_I(inode)->i_disksize)
+			EXT4_I(inode)->i_disksize = disksize;
+	}
+
+	set_buffer_new(bh_result);
 
 	/* Cache only when it is _not_ an uninitialized extent */
 	if (create != EXT4_CREATE_UNINITIALIZED_EXT)
@@ -2733,7 +2774,7 @@ out:
 	if (allocated > max_blocks)
 		allocated = max_blocks;
 	ext4_ext_show_leaf(inode, path);
-	__set_bit(BH_Mapped, &bh_result->b_state);
+	set_buffer_mapped(bh_result);
 	bh_result->b_bdev = inode->i_sb->s_bdev;
 	bh_result->b_blocknr = newblock;
 out2:
@@ -2744,7 +2785,7 @@ out2:
 	return err ? err : allocated;
 }
 
-void ext4_ext_truncate(struct inode * inode, struct page *page)
+void ext4_ext_truncate(struct inode *inode)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct super_block *sb = inode->i_sb;
@@ -2757,18 +2798,14 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
 	 */
 	err = ext4_writepage_trans_blocks(inode) + 3;
 	handle = ext4_journal_start(inode, err);
-	if (IS_ERR(handle)) {
-		if (page) {
-			clear_highpage(page);
-			flush_dcache_page(page);
-			unlock_page(page);
-			page_cache_release(page);
-		}
+	if (IS_ERR(handle))
 		return;
-	}
 
-	if (page)
-		ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+	if (inode->i_size & (sb->s_blocksize - 1))
+		ext4_block_truncate_page(handle, mapping, inode->i_size);
+
+	if (ext4_orphan_add(handle, inode))
+		goto out_stop;
 
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_ext_invalidate_cache(inode);
@@ -2780,8 +2817,6 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
 	 * Probably we need not scan at all,
 	 * because page truncation is enough.
 	 */
-	if (ext4_orphan_add(handle, inode))
-		goto out_stop;
 
 	/* we have to know where to truncate from in crash case */
 	EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2798,6 +2833,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
 		handle->h_sync = 1;
 
 out_stop:
+	up_write(&EXT4_I(inode)->i_data_sem);
 	/*
 	 * If this was a simple ftruncate() and the file will remain alive,
 	 * then we need to clear up the orphan record which we created above.
@@ -2808,7 +2844,6 @@ out_stop:
 	if (inode->i_nlink)
 		ext4_orphan_del(handle, inode);
 
-	up_write(&EXT4_I(inode)->i_data_sem);
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_journal_stop(handle);
@@ -2911,7 +2946,7 @@ retry:
 		}
 		ret = ext4_get_blocks_wrap(handle, inode, block,
 					  max_blocks, &map_bh,
-					  EXT4_CREATE_UNINITIALIZED_EXT, 0);
+					  EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
 		if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
 			WARN_ON(ret <= 0);
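Aside: a worked example for ext4_ext_calc_metadata_amount() added above, assuming 4KiB blocks: a leaf or index block then holds (4096 - 12) / 12 = 340 entries and the in-inode root holds (60 - 12) / 12 = 4. For blocks = 1000 the worst case is leafs = ceil(1000 / 340) = 3 leaf blocks, plus idxs = ceil(3 / 340) = 1 index block to link them; the do/while stops because 1 <= 4 index blocks fit under the root, so the function returns 3 + 1 = 4 metadata blocks.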
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4159be6366ab..430eb7978db4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -123,6 +123,23 @@ force_commit:
 	return ret;
 }
 
+static struct vm_operations_struct ext4_file_vm_ops = {
+	.fault		= filemap_fault,
+	.page_mkwrite	= ext4_page_mkwrite,
+};
+
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct address_space *mapping = file->f_mapping;
+
+	if (!mapping->a_ops->readpage)
+		return -ENOEXEC;
+	file_accessed(file);
+	vma->vm_ops = &ext4_file_vm_ops;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
+	return 0;
+}
+
 const struct file_operations ext4_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext4_compat_ioctl,
 #endif
-	.mmap		= generic_file_mmap,
+	.mmap		= ext4_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext4_release_file,
 	.fsync		= ext4_sync_file,
@@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
 const struct inode_operations ext4_file_inode_operations = {
 	.truncate	= ext4_truncate,
 	.setattr	= ext4_setattr,
+	.getattr	= ext4_getattr,
 #ifdef CONFIG_EXT4DEV_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 1c8ba48d4f8d..a45c3737ad31 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -27,6 +27,7 @@
 #include <linux/sched.h>
 #include <linux/writeback.h>
 #include <linux/jbd2.h>
+#include <linux/blkdev.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
@@ -45,6 +46,7 @@
 int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
+	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
 	int ret = 0;
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
 			.nr_to_write = 0, /* sys_fsync did this */
 		};
 		ret = sync_inode(inode, &wbc);
+		if (journal && (journal->j_flags & JBD2_BARRIER))
+			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 	}
 out:
 	return ret;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 7eb0604e7eea..c2c0a8d06d0e 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
 			   struct ext4_group_desc *gdp);
 extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
 				       struct ext4_group_desc *gdp);
-struct buffer_head *read_block_bitmap(struct super_block *sb,
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
 				      ext4_group_t block_group);
 extern unsigned ext4_init_block_bitmap(struct super_block *sb,
 				       struct buffer_head *bh,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c6efbab0c801..a92eb305344f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
 	struct ext4_super_block * es;
 	struct ext4_sb_info *sbi;
 	int fatal = 0, err;
+	ext4_group_t flex_group;
 
 	if (atomic_read(&inode->i_count) > 1) {
 		printk ("ext4_free_inode: inode has count=%d\n",
@@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
 		if (is_directory)
 			percpu_counter_dec(&sbi->s_dirs_counter);
 
+		if (sbi->s_log_groups_per_flex) {
+			flex_group = ext4_flex_group(sbi, block_group);
+			spin_lock(sb_bgl_lock(sbi, flex_group));
+			sbi->s_flex_groups[flex_group].free_inodes++;
+			spin_unlock(sb_bgl_lock(sbi, flex_group));
+		}
 	}
 	BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
 	err = ext4_journal_dirty_metadata(handle, bh2);
@@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
 	return ret;
 }
 
+#define free_block_ratio 10
+
+static int find_group_flex(struct super_block *sb, struct inode *parent,
+			   ext4_group_t *best_group)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *desc;
+	struct buffer_head *bh;
+	struct flex_groups *flex_group = sbi->s_flex_groups;
+	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+	ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
+	ext4_group_t ngroups = sbi->s_groups_count;
+	int flex_size = ext4_flex_bg_size(sbi);
+	ext4_group_t best_flex = parent_fbg_group;
+	int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
+	int flexbg_free_blocks;
+	int flex_freeb_ratio;
+	ext4_group_t n_fbg_groups;
+	ext4_group_t i;
+
+	n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+		sbi->s_log_groups_per_flex;
+
+find_close_to_parent:
+	flexbg_free_blocks = flex_group[best_flex].free_blocks;
+	flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+	if (flex_group[best_flex].free_inodes &&
+	    flex_freeb_ratio > free_block_ratio)
+		goto found_flexbg;
+
+	if (best_flex && best_flex == parent_fbg_group) {
+		best_flex--;
+		goto find_close_to_parent;
+	}
+
+	for (i = 0; i < n_fbg_groups; i++) {
+		if (i == parent_fbg_group || i == parent_fbg_group - 1)
+			continue;
+
+		flexbg_free_blocks = flex_group[i].free_blocks;
+		flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+
+		if (flex_freeb_ratio > free_block_ratio &&
+		    flex_group[i].free_inodes) {
+			best_flex = i;
+			goto found_flexbg;
+		}
+
+		if (best_flex < 0 ||
+		    (flex_group[i].free_blocks >
+		     flex_group[best_flex].free_blocks &&
+		     flex_group[i].free_inodes))
+			best_flex = i;
+	}
+
+	if (!flex_group[best_flex].free_inodes ||
+	    !flex_group[best_flex].free_blocks)
+		return -1;
+
+found_flexbg:
+	for (i = best_flex * flex_size; i < ngroups &&
+		     i < (best_flex + 1) * flex_size; i++) {
+		desc = ext4_get_group_desc(sb, i, &bh);
+		if (le16_to_cpu(desc->bg_free_inodes_count)) {
+			*best_group = i;
+			goto out;
+		}
+	}
+
+	return -1;
+out:
+	return 0;
+}
+
 /*
  * Orlov's allocator for directories.
  *
@@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
 	struct inode *ret;
 	ext4_group_t i;
 	int free = 0;
+	ext4_group_t flex_group;
 
 	/* Cannot create files in a deleted directory */
 	if (!dir || !dir->i_nlink)
@@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
514 596
515 sbi = EXT4_SB(sb); 597 sbi = EXT4_SB(sb);
516 es = sbi->s_es; 598 es = sbi->s_es;
599
600 if (sbi->s_log_groups_per_flex) {
601 ret2 = find_group_flex(sb, dir, &group);
602 goto got_group;
603 }
604
517 if (S_ISDIR(mode)) { 605 if (S_ISDIR(mode)) {
518 if (test_opt (sb, OLDALLOC)) 606 if (test_opt (sb, OLDALLOC))
519 ret2 = find_group_dir(sb, dir, &group); 607 ret2 = find_group_dir(sb, dir, &group);
@@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
522 } else 610 } else
523 ret2 = find_group_other(sb, dir, &group); 611 ret2 = find_group_other(sb, dir, &group);
524 612
613got_group:
525 err = -ENOSPC; 614 err = -ENOSPC;
526 if (ret2 == -1) 615 if (ret2 == -1)
527 goto out; 616 goto out;
@@ -600,7 +689,7 @@ got:
600 /* We may have to initialize the block bitmap if it isn't already */ 689 /* We may have to initialize the block bitmap if it isn't already */
601 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && 690 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
602 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 691 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
603 struct buffer_head *block_bh = read_block_bitmap(sb, group); 692 struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
604 693
605 BUFFER_TRACE(block_bh, "get block bitmap access"); 694 BUFFER_TRACE(block_bh, "get block bitmap access");
606 err = ext4_journal_get_write_access(handle, block_bh); 695 err = ext4_journal_get_write_access(handle, block_bh);
@@ -676,6 +765,13 @@ got:
676 percpu_counter_inc(&sbi->s_dirs_counter); 765 percpu_counter_inc(&sbi->s_dirs_counter);
677 sb->s_dirt = 1; 766 sb->s_dirt = 1;
678 767
768 if (sbi->s_log_groups_per_flex) {
769 flex_group = ext4_flex_group(sbi, group);
770 spin_lock(sb_bgl_lock(sbi, flex_group));
771 sbi->s_flex_groups[flex_group].free_inodes--;
772 spin_unlock(sb_bgl_lock(sbi, flex_group));
773 }
774
679 inode->i_uid = current->fsuid; 775 inode->i_uid = current->fsuid;
680 if (test_opt (sb, GRPID)) 776 if (test_opt (sb, GRPID))
681 inode->i_gid = dir->i_gid; 777 inode->i_gid = dir->i_gid;
@@ -740,14 +836,10 @@ got:
740 goto fail_free_drop; 836 goto fail_free_drop;
741 837
742 if (test_opt(sb, EXTENTS)) { 838 if (test_opt(sb, EXTENTS)) {
743 /* set extent flag only for diretory, file and normal symlink*/ 839 /* set extent flag only for directory, file and normal symlink*/
744 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 840 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
745 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 841 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
746 ext4_ext_tree_init(handle, inode); 842 ext4_ext_tree_init(handle, inode);
747 err = ext4_update_incompat_feature(handle, sb,
748 EXT4_FEATURE_INCOMPAT_EXTENTS);
749 if (err)
750 goto fail_free_drop;
751 } 843 }
752 } 844 }
753 845
@@ -817,6 +909,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
817 if (IS_ERR(inode)) 909 if (IS_ERR(inode))
818 goto iget_failed; 910 goto iget_failed;
819 911
912 /*
913 * If the orphan has i_nlink > 0 then it should be able to be
914 * truncated, otherwise it won't be removed from the orphan list
915 * during processing and an infinite loop will result.
916 */
917 if (inode->i_nlink && !ext4_can_truncate(inode))
918 goto bad_orphan;
919
820 if (NEXT_ORPHAN(inode) > max_ino) 920 if (NEXT_ORPHAN(inode) > max_ino)
821 goto bad_orphan; 921 goto bad_orphan;
822 brelse(bitmap_bh); 922 brelse(bitmap_bh);
@@ -838,6 +938,7 @@ bad_orphan:
838 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", 938 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
839 NEXT_ORPHAN(inode)); 939 NEXT_ORPHAN(inode));
840 printk(KERN_NOTICE "max_ino=%lu\n", max_ino); 940 printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
941 printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
841 /* Avoid freeing blocks if we got a bad deleted inode */ 942 /* Avoid freeing blocks if we got a bad deleted inode */
842 if (inode->i_nlink == 0) 943 if (inode->i_nlink == 0)
843 inode->i_blocks = 0; 944 inode->i_blocks = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d9707746413..8ca2763df091 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,12 +32,23 @@
32#include <linux/string.h> 32#include <linux/string.h>
33#include <linux/buffer_head.h> 33#include <linux/buffer_head.h>
34#include <linux/writeback.h> 34#include <linux/writeback.h>
35#include <linux/pagevec.h>
35#include <linux/mpage.h> 36#include <linux/mpage.h>
36#include <linux/uio.h> 37#include <linux/uio.h>
37#include <linux/bio.h> 38#include <linux/bio.h>
38#include "ext4_jbd2.h" 39#include "ext4_jbd2.h"
39#include "xattr.h" 40#include "xattr.h"
40#include "acl.h" 41#include "acl.h"
42#include "ext4_extents.h"
43
44static inline int ext4_begin_ordered_truncate(struct inode *inode,
45 loff_t new_size)
46{
47 return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
48 new_size);
49}
50
51static void ext4_invalidatepage(struct page *page, unsigned long offset);
41 52
42/* 53/*
43 * Test whether an inode is a fast symlink. 54 * Test whether an inode is a fast symlink.
@@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode)
181{ 192{
182 handle_t *handle; 193 handle_t *handle;
183 194
195 if (ext4_should_order_data(inode))
196 ext4_begin_ordered_truncate(inode, 0);
184 truncate_inode_pages(&inode->i_data, 0); 197 truncate_inode_pages(&inode->i_data, 0);
185 198
186 if (is_bad_inode(inode)) 199 if (is_bad_inode(inode))
@@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
508 * direct blocks 521 * direct blocks
509 */ 522 */
510static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 523static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
511 ext4_fsblk_t goal, int indirect_blks, int blks, 524 ext4_lblk_t iblock, ext4_fsblk_t goal,
512 ext4_fsblk_t new_blocks[4], int *err) 525 int indirect_blks, int blks,
526 ext4_fsblk_t new_blocks[4], int *err)
513{ 527{
514 int target, i; 528 int target, i;
515 unsigned long count = 0; 529 unsigned long count = 0, blk_allocated = 0;
516 int index = 0; 530 int index = 0;
517 ext4_fsblk_t current_block = 0; 531 ext4_fsblk_t current_block = 0;
518 int ret = 0; 532 int ret = 0;
@@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
525 * the first direct block of this branch. That's the 539 * the first direct block of this branch. That's the
526 * minimum number of blocks need to allocate(required) 540 * minimum number of blocks need to allocate(required)
527 */ 541 */
528 target = blks + indirect_blks; 542 /* first we try to allocate the indirect blocks */
529 543 target = indirect_blks;
530 while (1) { 544 while (target > 0) {
531 count = target; 545 count = target;
532 /* allocating blocks for indirect blocks and direct blocks */ 546 /* allocating blocks for indirect blocks and direct blocks */
533 current_block = ext4_new_blocks(handle,inode,goal,&count,err); 547 current_block = ext4_new_meta_blocks(handle, inode,
548 goal, &count, err);
534 if (*err) 549 if (*err)
535 goto failed_out; 550 goto failed_out;
536 551
@@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
540 new_blocks[index++] = current_block++; 555 new_blocks[index++] = current_block++;
541 count--; 556 count--;
542 } 557 }
543 558 if (count > 0) {
544 if (count > 0) 559 /*
560 * save the new block number
561 * for the first direct block
562 */
563 new_blocks[index] = current_block;
564 printk(KERN_INFO "%s returned more blocks than "
565 "requested\n", __func__);
566 WARN_ON(1);
545 break; 567 break;
568 }
546 } 569 }
547 570
548 /* save the new block number for the first direct block */ 571 target = blks - count ;
549 new_blocks[index] = current_block; 572 blk_allocated = count;
550 573 if (!target)
574 goto allocated;
575 /* Now allocate data blocks */
576 count = target;
577 /* allocating blocks for data blocks */
578 current_block = ext4_new_blocks(handle, inode, iblock,
579 goal, &count, err);
580 if (*err && (target == blks)) {
581 /*
582 * if the allocation failed and we didn't allocate
583 * any blocks before
584 */
585 goto failed_out;
586 }
587 if (!*err) {
588 if (target == blks) {
589 /*
590 * save the new block number
591 * for the first direct block
592 */
593 new_blocks[index] = current_block;
594 }
595 blk_allocated += count;
596 }
597allocated:
551 /* total number of blocks allocated for direct blocks */ 598 /* total number of blocks allocated for direct blocks */
552 ret = count; 599 ret = blk_allocated;
553 *err = 0; 600 *err = 0;
554 return ret; 601 return ret;
555failed_out: 602failed_out:
@@ -584,8 +631,9 @@ failed_out:
584 * as described above and return 0. 631 * as described above and return 0.
585 */ 632 */
586static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 633static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
587 int indirect_blks, int *blks, ext4_fsblk_t goal, 634 ext4_lblk_t iblock, int indirect_blks,
588 ext4_lblk_t *offsets, Indirect *branch) 635 int *blks, ext4_fsblk_t goal,
636 ext4_lblk_t *offsets, Indirect *branch)
589{ 637{
590 int blocksize = inode->i_sb->s_blocksize; 638 int blocksize = inode->i_sb->s_blocksize;
591 int i, n = 0; 639 int i, n = 0;
@@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
595 ext4_fsblk_t new_blocks[4]; 643 ext4_fsblk_t new_blocks[4];
596 ext4_fsblk_t current_block; 644 ext4_fsblk_t current_block;
597 645
598 num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, 646 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
599 *blks, new_blocks, &err); 647 *blks, new_blocks, &err);
600 if (err) 648 if (err)
601 return err; 649 return err;
@@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
799 struct ext4_inode_info *ei = EXT4_I(inode); 847 struct ext4_inode_info *ei = EXT4_I(inode);
800 int count = 0; 848 int count = 0;
801 ext4_fsblk_t first_block = 0; 849 ext4_fsblk_t first_block = 0;
850 loff_t disksize;
802 851
803 852
804 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 853 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
855 /* 904 /*
856 * Block out ext4_truncate while we alter the tree 905 * Block out ext4_truncate while we alter the tree
857 */ 906 */
858 err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, 907 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
859 offsets + (partial - chain), partial); 908 &count, goal,
909 offsets + (partial - chain), partial);
860 910
861 /* 911 /*
862 * The ext4_splice_branch call will free and forget any buffers 912 * The ext4_splice_branch call will free and forget any buffers
@@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
873 * protect it if you're about to implement concurrent 923 * protect it if you're about to implement concurrent
874 * ext4_get_block() -bzzz 924 * ext4_get_block() -bzzz
875 */ 925 */
876 if (!err && extend_disksize && inode->i_size > ei->i_disksize) 926 if (!err && extend_disksize) {
877 ei->i_disksize = inode->i_size; 927 disksize = ((loff_t) iblock + count) << inode->i_blkbits;
928 if (disksize > i_size_read(inode))
929 disksize = i_size_read(inode);
930 if (disksize > ei->i_disksize)
931 ei->i_disksize = disksize;
932 }
878 if (err) 933 if (err)
879 goto cleanup; 934 goto cleanup;
880 935
@@ -934,7 +989,7 @@ out:
934 */ 989 */
935int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 990int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
936 unsigned long max_blocks, struct buffer_head *bh, 991 unsigned long max_blocks, struct buffer_head *bh,
937 int create, int extend_disksize) 992 int create, int extend_disksize, int flag)
938{ 993{
939 int retval; 994 int retval;
940 995
@@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
975 * with create == 1 flag. 1030 * with create == 1 flag.
976 */ 1031 */
977 down_write((&EXT4_I(inode)->i_data_sem)); 1032 down_write((&EXT4_I(inode)->i_data_sem));
1033
1034 /*
1035 * if the caller is from delayed allocation writeout path
1036 * we have already reserved fs blocks for allocation
1037 * let the underlying get_block() function know to
1038 * avoid double accounting
1039 */
1040 if (flag)
1041 EXT4_I(inode)->i_delalloc_reserved_flag = 1;
978 /* 1042 /*
979 * We need to check for EXT4 here because migrate 1043 * We need to check for EXT4 here because migrate
980 * could have changed the inode type in between 1044 * could have changed the inode type in between
@@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
996 ~EXT4_EXT_MIGRATE; 1060 ~EXT4_EXT_MIGRATE;
997 } 1061 }
998 } 1062 }
1063
1064 if (flag) {
1065 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1066 /*
1067 * Update reserved blocks/metadata blocks
1068 * after successful block allocation
1069 * which were deferred till now
1070 */
1071 if ((retval > 0) && buffer_delay(bh))
1072 ext4_da_release_space(inode, retval, 0);
1073 }
1074
999 up_write((&EXT4_I(inode)->i_data_sem)); 1075 up_write((&EXT4_I(inode)->i_data_sem));
1000 return retval; 1076 return retval;
1001} 1077}
@@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
1021 } 1097 }
1022 1098
1023 ret = ext4_get_blocks_wrap(handle, inode, iblock, 1099 ret = ext4_get_blocks_wrap(handle, inode, iblock,
1024 max_blocks, bh_result, create, 0); 1100 max_blocks, bh_result, create, 0, 0);
1025 if (ret > 0) { 1101 if (ret > 0) {
1026 bh_result->b_size = (ret << inode->i_blkbits); 1102 bh_result->b_size = (ret << inode->i_blkbits);
1027 ret = 0; 1103 ret = 0;
@@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1047 dummy.b_blocknr = -1000; 1123 dummy.b_blocknr = -1000;
1048 buffer_trace_init(&dummy.b_history); 1124 buffer_trace_init(&dummy.b_history);
1049 err = ext4_get_blocks_wrap(handle, inode, block, 1, 1125 err = ext4_get_blocks_wrap(handle, inode, block, 1,
1050 &dummy, create, 1); 1126 &dummy, create, 1, 0);
1051 /* 1127 /*
1052 * ext4_get_blocks_handle() returns number of blocks 1128 * ext4_get_blocks_handle() returns number of blocks
1053 * mapped. 0 in case of a HOLE. 1129 * mapped. 0 in case of a HOLE.
@@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1203 to = from + len; 1279 to = from + len;
1204 1280
1205retry: 1281retry:
1206 page = __grab_cache_page(mapping, index);
1207 if (!page)
1208 return -ENOMEM;
1209 *pagep = page;
1210
1211 handle = ext4_journal_start(inode, needed_blocks); 1282 handle = ext4_journal_start(inode, needed_blocks);
1212 if (IS_ERR(handle)) { 1283 if (IS_ERR(handle)) {
1213 unlock_page(page);
1214 page_cache_release(page);
1215 ret = PTR_ERR(handle); 1284 ret = PTR_ERR(handle);
1216 goto out; 1285 goto out;
1217 } 1286 }
1218 1287
1288 page = __grab_cache_page(mapping, index);
1289 if (!page) {
1290 ext4_journal_stop(handle);
1291 ret = -ENOMEM;
1292 goto out;
1293 }
1294 *pagep = page;
1295
1219 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1296 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
1220 ext4_get_block); 1297 ext4_get_block);
1221 1298
@@ -1225,8 +1302,8 @@ retry:
1225 } 1302 }
1226 1303
1227 if (ret) { 1304 if (ret) {
1228 ext4_journal_stop(handle);
1229 unlock_page(page); 1305 unlock_page(page);
1306 ext4_journal_stop(handle);
1230 page_cache_release(page); 1307 page_cache_release(page);
1231 } 1308 }
1232 1309
@@ -1236,15 +1313,6 @@ out:
1236 return ret; 1313 return ret;
1237} 1314}
1238 1315
1239int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1240{
1241 int err = jbd2_journal_dirty_data(handle, bh);
1242 if (err)
1243 ext4_journal_abort_handle(__func__, __func__,
1244 bh, handle, err);
1245 return err;
1246}
1247
1248/* For write_end() in data=journal mode */ 1316/* For write_end() in data=journal mode */
1249static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1317static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1250{ 1318{
@@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1255} 1323}
1256 1324
1257/* 1325/*
1258 * Generic write_end handler for ordered and writeback ext4 journal modes.
1259 * We can't use generic_write_end, because that unlocks the page and we need to
1260 * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
1261 * after block_write_end.
1262 */
1263static int ext4_generic_write_end(struct file *file,
1264 struct address_space *mapping,
1265 loff_t pos, unsigned len, unsigned copied,
1266 struct page *page, void *fsdata)
1267{
1268 struct inode *inode = file->f_mapping->host;
1269
1270 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1271
1272 if (pos+copied > inode->i_size) {
1273 i_size_write(inode, pos+copied);
1274 mark_inode_dirty(inode);
1275 }
1276
1277 return copied;
1278}
1279
1280/*
1281 * We need to pick up the new inode size which generic_commit_write gave us 1326 * We need to pick up the new inode size which generic_commit_write gave us
1282 * `file' can be NULL - eg, when called from page_symlink(). 1327 * `file' can be NULL - eg, when called from page_symlink().
1283 * 1328 *
@@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file,
1290 struct page *page, void *fsdata) 1335 struct page *page, void *fsdata)
1291{ 1336{
1292 handle_t *handle = ext4_journal_current_handle(); 1337 handle_t *handle = ext4_journal_current_handle();
1293 struct inode *inode = file->f_mapping->host; 1338 struct inode *inode = mapping->host;
1294 unsigned from, to; 1339 unsigned from, to;
1295 int ret = 0, ret2; 1340 int ret = 0, ret2;
1296 1341
1297 from = pos & (PAGE_CACHE_SIZE - 1); 1342 from = pos & (PAGE_CACHE_SIZE - 1);
1298 to = from + len; 1343 to = from + len;
1299 1344
1300 ret = walk_page_buffers(handle, page_buffers(page), 1345 ret = ext4_jbd2_file_inode(handle, inode);
1301 from, to, NULL, ext4_journal_dirty_data);
1302 1346
1303 if (ret == 0) { 1347 if (ret == 0) {
1304 /* 1348 /*
@@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file,
1311 new_i_size = pos + copied; 1355 new_i_size = pos + copied;
1312 if (new_i_size > EXT4_I(inode)->i_disksize) 1356 if (new_i_size > EXT4_I(inode)->i_disksize)
1313 EXT4_I(inode)->i_disksize = new_i_size; 1357 EXT4_I(inode)->i_disksize = new_i_size;
1314 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1358 ret2 = generic_write_end(file, mapping, pos, len, copied,
1315 page, fsdata); 1359 page, fsdata);
1316 copied = ret2; 1360 copied = ret2;
1317 if (ret2 < 0) 1361 if (ret2 < 0)
@@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file,
1320 ret2 = ext4_journal_stop(handle); 1364 ret2 = ext4_journal_stop(handle);
1321 if (!ret) 1365 if (!ret)
1322 ret = ret2; 1366 ret = ret2;
1323 unlock_page(page);
1324 page_cache_release(page);
1325 1367
1326 return ret ? ret : copied; 1368 return ret ? ret : copied;
1327} 1369}
@@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file,
1332 struct page *page, void *fsdata) 1374 struct page *page, void *fsdata)
1333{ 1375{
1334 handle_t *handle = ext4_journal_current_handle(); 1376 handle_t *handle = ext4_journal_current_handle();
1335 struct inode *inode = file->f_mapping->host; 1377 struct inode *inode = mapping->host;
1336 int ret = 0, ret2; 1378 int ret = 0, ret2;
1337 loff_t new_i_size; 1379 loff_t new_i_size;
1338 1380
@@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file,
1340 if (new_i_size > EXT4_I(inode)->i_disksize) 1382 if (new_i_size > EXT4_I(inode)->i_disksize)
1341 EXT4_I(inode)->i_disksize = new_i_size; 1383 EXT4_I(inode)->i_disksize = new_i_size;
1342 1384
1343 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, 1385 ret2 = generic_write_end(file, mapping, pos, len, copied,
1344 page, fsdata); 1386 page, fsdata);
1345 copied = ret2; 1387 copied = ret2;
1346 if (ret2 < 0) 1388 if (ret2 < 0)
@@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file,
1349 ret2 = ext4_journal_stop(handle); 1391 ret2 = ext4_journal_stop(handle);
1350 if (!ret) 1392 if (!ret)
1351 ret = ret2; 1393 ret = ret2;
1352 unlock_page(page);
1353 page_cache_release(page);
1354 1394
1355 return ret ? ret : copied; 1395 return ret ? ret : copied;
1356} 1396}
@@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file,
1389 ret = ret2; 1429 ret = ret2;
1390 } 1430 }
1391 1431
1432 unlock_page(page);
1392 ret2 = ext4_journal_stop(handle); 1433 ret2 = ext4_journal_stop(handle);
1393 if (!ret) 1434 if (!ret)
1394 ret = ret2; 1435 ret = ret2;
1395 unlock_page(page);
1396 page_cache_release(page); 1436 page_cache_release(page);
1397 1437
1398 return ret ? ret : copied; 1438 return ret ? ret : copied;
1399} 1439}
1440/*
1441 * Calculate the number of metadata blocks that need to be reserved
1442 * to allocate @blocks for a non-extent-based file
1443 */
1444static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
1445{
1446 int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1447 int ind_blks, dind_blks, tind_blks;
1448
1449 /* number of new indirect blocks needed */
1450 ind_blks = (blocks + icap - 1) / icap;
1451
1452 dind_blks = (ind_blks + icap - 1) / icap;
1453
1454 tind_blks = 1;
1455
1456 return ind_blks + dind_blks + tind_blks;
1457}
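
As a worked example of the estimate above, assume 4 KiB blocks, i.e. 1024 block addresses per block: reserving 1000 data blocks needs at most one indirect, one double-indirect and one triple-indirect block. The standalone sketch below just redoes that arithmetic and is illustrative only.

#include <stdio.h>

/* same shape as ext4_indirect_calc_metadata_amount() above */
static int indirect_meta(int blocks, int addrs_per_block)
{
	int ind = (blocks + addrs_per_block - 1) / addrs_per_block;
	int dind = (ind + addrs_per_block - 1) / addrs_per_block;

	return ind + dind + 1;		/* plus one triple-indirect block */
}

int main(void)
{
	/* 4 KiB blocks => 1024 addresses per block; 1000 data blocks => 3 */
	printf("%d\n", indirect_meta(1000, 1024));
	return 0;
}
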
1458
1459/*
1460 * Calculate the number of metadata blocks that need to be reserved
1461 * to allocate the given number of blocks
1462 */
1463static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
1464{
1465 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
1466 return ext4_ext_calc_metadata_amount(inode, blocks);
1467
1468 return ext4_indirect_calc_metadata_amount(inode, blocks);
1469}
1470
1471static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1472{
1473 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1474 unsigned long md_needed, mdblocks, total = 0;
1475
1476 /*
1477 * recalculate the amount of metadata blocks to reserve
1478 * in order to allocate nrblocks
1479 * worst case is one extent per block
1480 */
1481 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1482 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
1483 mdblocks = ext4_calc_metadata_amount(inode, total);
1484 BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
1485
1486 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
1487 total = md_needed + nrblocks;
1488
1489 if (ext4_has_free_blocks(sbi, total) < total) {
1490 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1491 return -ENOSPC;
1492 }
1493
1494 /* reduce fs free blocks counter */
1495 percpu_counter_sub(&sbi->s_freeblocks_counter, total);
1496
1497 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
1498 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
1499
1500 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1501 return 0; /* success */
1502}
1503
1504void ext4_da_release_space(struct inode *inode, int used, int to_free)
1505{
1506 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1507 int total, mdb, mdb_free, release;
1508
1509 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1510 /* recalculate the number of metadata blocks that still need to be reserved */
1511 total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free;
1512 mdb = ext4_calc_metadata_amount(inode, total);
1513
1514 /* figure out how many metablocks to release */
1515 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1516 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
1517
1518 /* Account for allocated meta_blocks */
1519 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
1520
1521 release = to_free + mdb_free;
1522
1523 /* update fs free blocks counter for truncate case */
1524 percpu_counter_add(&sbi->s_freeblocks_counter, release);
1525
1526 /* update per-inode reservations */
1527 BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks);
1528 EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free);
1529
1530 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1531 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1532 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1533 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1534}
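
The two helpers above keep the per-inode delayed-allocation accounting: a reservation charges the new data blocks plus only the increase in worst-case metadata, and a release returns whatever metadata reservation is no longer needed. The standalone sketch below mimics that bookkeeping; meta_for() is a crude stand-in for ext4_calc_metadata_amount() and the numbers are made up.

#include <stdio.h>

static int meta_for(int data_blocks)
{
	return data_blocks / 256 + 3;		/* crude worst-case estimate */
}

struct resv { long fs_free; int data, meta; };

static int reserve(struct resv *r, int nrblocks)
{
	int mdblocks = meta_for(r->data + nrblocks);
	int charge = nrblocks + (mdblocks - r->meta);	/* data + metadata delta */

	if (charge > r->fs_free)
		return -1;			/* -ENOSPC in the real code */
	r->fs_free -= charge;
	r->data += nrblocks;
	r->meta = mdblocks;
	return 0;
}

static void release(struct resv *r, int used, int to_free)
{
	int mdb = meta_for(r->data - used - to_free);

	r->fs_free += to_free + (r->meta - mdb);	/* give back the unused part */
	r->data -= used + to_free;
	r->meta = mdb;
}

int main(void)
{
	struct resv r = { 100000, 0, 0 };

	reserve(&r, 1000);
	release(&r, 600, 100);
	printf("free=%ld data=%d meta=%d\n", r.fs_free, r.data, r.meta);
	return 0;
}
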
1535
1536static void ext4_da_page_release_reservation(struct page *page,
1537 unsigned long offset)
1538{
1539 int to_release = 0;
1540 struct buffer_head *head, *bh;
1541 unsigned int curr_off = 0;
1542
1543 head = page_buffers(page);
1544 bh = head;
1545 do {
1546 unsigned int next_off = curr_off + bh->b_size;
1547
1548 if ((offset <= curr_off) && (buffer_delay(bh))) {
1549 to_release++;
1550 clear_buffer_delay(bh);
1551 }
1552 curr_off = next_off;
1553 } while ((bh = bh->b_this_page) != head);
1554 ext4_da_release_space(page->mapping->host, 0, to_release);
1555}
1556
1557/*
1558 * Delayed allocation stuff
1559 */
1560
1561struct mpage_da_data {
1562 struct inode *inode;
1563 struct buffer_head lbh; /* extent of blocks */
1564 unsigned long first_page, next_page; /* extent of pages */
1565 get_block_t *get_block;
1566 struct writeback_control *wbc;
1567};
1568
1569/*
1570 * mpage_da_submit_io - walks through the extent of pages and tries to write
1571 * them with __mpage_writepage()
1572 *
1573 * @mpd->inode: inode
1574 * @mpd->first_page: first page of the extent
1575 * @mpd->next_page: page after the last page of the extent
1576 * @mpd->get_block: the filesystem's block mapper function
1577 *
1578 * By the time mpage_da_submit_io() is called we expect all blocks
1579 * to be allocated. This may be wrong if allocation failed.
1580 *
1581 * As pages are already locked by write_cache_pages(), we can't use it
1582 */
1583static int mpage_da_submit_io(struct mpage_da_data *mpd)
1584{
1585 struct address_space *mapping = mpd->inode->i_mapping;
1586 struct mpage_data mpd_pp = {
1587 .bio = NULL,
1588 .last_block_in_bio = 0,
1589 .get_block = mpd->get_block,
1590 .use_writepage = 1,
1591 };
1592 int ret = 0, err, nr_pages, i;
1593 unsigned long index, end;
1594 struct pagevec pvec;
1595
1596 BUG_ON(mpd->next_page <= mpd->first_page);
1597
1598 pagevec_init(&pvec, 0);
1599 index = mpd->first_page;
1600 end = mpd->next_page - 1;
1601
1602 while (index <= end) {
1603 /* XXX: optimize tail */
1604 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1605 if (nr_pages == 0)
1606 break;
1607 for (i = 0; i < nr_pages; i++) {
1608 struct page *page = pvec.pages[i];
1609
1610 index = page->index;
1611 if (index > end)
1612 break;
1613 index++;
1614
1615 err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
1616
1617 /*
1618 * In error case, we have to continue because
1619 * remaining pages are still locked
1620 * XXX: unlock and re-dirty them?
1621 */
1622 if (ret == 0)
1623 ret = err;
1624 }
1625 pagevec_release(&pvec);
1626 }
1627 if (mpd_pp.bio)
1628 mpage_bio_submit(WRITE, mpd_pp.bio);
1629
1630 return ret;
1631}
1632
1633/*
1634 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
1635 *
1636 * @mpd->inode - inode to walk through
1637 * @exbh->b_blocknr - first block on a disk
1638 * @exbh->b_size - amount of space in bytes
1639 * @logical - first logical block to start assignment with
1640 *
1641 * the function goes through all of the passed space and puts actual disk
1642 * block numbers into buffer heads, dropping BH_Delay
1643 */
1644static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1645 struct buffer_head *exbh)
1646{
1647 struct inode *inode = mpd->inode;
1648 struct address_space *mapping = inode->i_mapping;
1649 int blocks = exbh->b_size >> inode->i_blkbits;
1650 sector_t pblock = exbh->b_blocknr, cur_logical;
1651 struct buffer_head *head, *bh;
1652 unsigned long index, end;
1653 struct pagevec pvec;
1654 int nr_pages, i;
1655
1656 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1657 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1658 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1659
1660 pagevec_init(&pvec, 0);
1661
1662 while (index <= end) {
1663 /* XXX: optimize tail */
1664 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1665 if (nr_pages == 0)
1666 break;
1667 for (i = 0; i < nr_pages; i++) {
1668 struct page *page = pvec.pages[i];
1669
1670 index = page->index;
1671 if (index > end)
1672 break;
1673 index++;
1674
1675 BUG_ON(!PageLocked(page));
1676 BUG_ON(PageWriteback(page));
1677 BUG_ON(!page_has_buffers(page));
1678
1679 bh = page_buffers(page);
1680 head = bh;
1681
1682 /* skip blocks out of the range */
1683 do {
1684 if (cur_logical >= logical)
1685 break;
1686 cur_logical++;
1687 } while ((bh = bh->b_this_page) != head);
1688
1689 do {
1690 if (cur_logical >= logical + blocks)
1691 break;
1692 if (buffer_delay(bh)) {
1693 bh->b_blocknr = pblock;
1694 clear_buffer_delay(bh);
1695 } else if (buffer_mapped(bh))
1696 BUG_ON(bh->b_blocknr != pblock);
1697
1698 cur_logical++;
1699 pblock++;
1700 } while ((bh = bh->b_this_page) != head);
1701 }
1702 pagevec_release(&pvec);
1703 }
1704}
1705
1706
1707/*
1708 * __unmap_underlying_blocks - just a helper function to unmap
1709 * set of blocks described by @bh
1710 */
1711static inline void __unmap_underlying_blocks(struct inode *inode,
1712 struct buffer_head *bh)
1713{
1714 struct block_device *bdev = inode->i_sb->s_bdev;
1715 int blocks, i;
1716
1717 blocks = bh->b_size >> inode->i_blkbits;
1718 for (i = 0; i < blocks; i++)
1719 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
1720}
1721
1722/*
1723 * mpage_da_map_blocks - go through given space
1724 *
1725 * @mpd->lbh - bh describing space
1726 * @mpd->get_block - the filesystem's block mapper function
1727 *
1728 * The function skips space we know is already mapped to disk blocks.
1729 *
1730 * The function ignores errors ->get_block() returns, thus real
1731 * error handling is postponed to __mpage_writepage()
1732 */
1733static void mpage_da_map_blocks(struct mpage_da_data *mpd)
1734{
1735 struct buffer_head *lbh = &mpd->lbh;
1736 int err = 0, remain = lbh->b_size;
1737 sector_t next = lbh->b_blocknr;
1738 struct buffer_head new;
1739
1740 /*
1741 * We consider only non-mapped and non-allocated blocks
1742 */
1743 if (buffer_mapped(lbh) && !buffer_delay(lbh))
1744 return;
1745
1746 while (remain) {
1747 new.b_state = lbh->b_state;
1748 new.b_blocknr = 0;
1749 new.b_size = remain;
1750 err = mpd->get_block(mpd->inode, next, &new, 1);
1751 if (err) {
1752 /*
1753 * Rather than implement own error handling
1754 * here, we just leave remaining blocks
1755 * unallocated and try again with ->writepage()
1756 */
1757 break;
1758 }
1759 BUG_ON(new.b_size == 0);
1760
1761 if (buffer_new(&new))
1762 __unmap_underlying_blocks(mpd->inode, &new);
1763
1764 /*
1765 * If blocks are delayed marked, we need to
1766 * put actual blocknr and drop delayed bit
1767 */
1768 if (buffer_delay(lbh))
1769 mpage_put_bnr_to_bhs(mpd, next, &new);
1770
1771 /* go for the remaining blocks */
1772 next += new.b_size >> mpd->inode->i_blkbits;
1773 remain -= new.b_size;
1774 }
1775}
1776
1777#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
1778
1779/*
1780 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
1781 *
1782 * @mpd->lbh - extent of blocks
1783 * @logical - logical number of the block in the file
1784 * @bh - bh of the block (used to access block's state)
1785 *
1786 * the function is used to collect contiguous blocks in the same state
1787 */
1788static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
1789 sector_t logical, struct buffer_head *bh)
1790{
1791 struct buffer_head *lbh = &mpd->lbh;
1792 sector_t next;
1793
1794 next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
1795
1796 /*
1797 * First block in the extent
1798 */
1799 if (lbh->b_size == 0) {
1800 lbh->b_blocknr = logical;
1801 lbh->b_size = bh->b_size;
1802 lbh->b_state = bh->b_state & BH_FLAGS;
1803 return;
1804 }
1805
1806 /*
1807 * Can we merge the block to our big extent?
1808 */
1809 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
1810 lbh->b_size += bh->b_size;
1811 return;
1812 }
1813
1814 /*
1815 * We couldn't merge the block to our extent, so we
1816 * need to flush current extent and start new one
1817 */
1818 mpage_da_map_blocks(mpd);
1819
1820 /*
1821 * Now start a new extent
1822 */
1823 lbh->b_size = bh->b_size;
1824 lbh->b_state = bh->b_state & BH_FLAGS;
1825 lbh->b_blocknr = logical;
1826}
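
mpage_add_bh_to_extent() therefore grows one run of logically contiguous blocks and flushes it through mpage_da_map_blocks() as soon as the next block cannot be merged. The sketch below shows the same accumulate-and-flush pattern on plain integers (it checks contiguity only and omits the buffer-state comparison); it is illustrative, not kernel code.

#include <stdio.h>

struct run { long start; int len; };

static void flush(struct run *r)
{
	if (r->len)
		printf("map blocks %ld..%ld\n", r->start, r->start + r->len - 1);
	r->len = 0;
}

static void add_block(struct run *r, long logical)
{
	if (r->len && logical == r->start + r->len) {
		r->len++;			/* contiguous: extend the run */
		return;
	}
	flush(r);				/* cannot merge: flush, start a new run */
	r->start = logical;
	r->len = 1;
}

int main(void)
{
	struct run r = { 0, 0 };
	long blocks[] = { 10, 11, 12, 40, 41 };
	int i;

	for (i = 0; i < 5; i++)
		add_block(&r, blocks[i]);
	flush(&r);				/* flush the final run */
	return 0;
}
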
1827
1828/*
1829 * __mpage_da_writepage - finds extent of pages and blocks
1830 *
1831 * @page: page to consider
1832 * @wbc: not used, we just follow rules
1833 * @data: context
1834 *
1835 * The function finds extents of pages and scans them for all blocks.
1836 */
1837static int __mpage_da_writepage(struct page *page,
1838 struct writeback_control *wbc, void *data)
1839{
1840 struct mpage_da_data *mpd = data;
1841 struct inode *inode = mpd->inode;
1842 struct buffer_head *bh, *head, fake;
1843 sector_t logical;
1844
1845 /*
1846 * Can we merge this page to current extent?
1847 */
1848 if (mpd->next_page != page->index) {
1849 /*
1850 * Nope, we can't. So, we map non-allocated blocks
1851 * and start IO on them using __mpage_writepage()
1852 */
1853 if (mpd->next_page != mpd->first_page) {
1854 mpage_da_map_blocks(mpd);
1855 mpage_da_submit_io(mpd);
1856 }
1857
1858 /*
1859 * Start next extent of pages ...
1860 */
1861 mpd->first_page = page->index;
1862
1863 /*
1864 * ... and blocks
1865 */
1866 mpd->lbh.b_size = 0;
1867 mpd->lbh.b_state = 0;
1868 mpd->lbh.b_blocknr = 0;
1869 }
1870
1871 mpd->next_page = page->index + 1;
1872 logical = (sector_t) page->index <<
1873 (PAGE_CACHE_SHIFT - inode->i_blkbits);
1874
1875 if (!page_has_buffers(page)) {
1876 /*
1877 * There are no buffer heads attached yet (mmap?), so
1878 * we treat the page as full of dirty blocks
1879 */
1880 bh = &fake;
1881 bh->b_size = PAGE_CACHE_SIZE;
1882 bh->b_state = 0;
1883 set_buffer_dirty(bh);
1884 set_buffer_uptodate(bh);
1885 mpage_add_bh_to_extent(mpd, logical, bh);
1886 } else {
1887 /*
1888 * Page with regular buffer heads, just add all dirty ones
1889 */
1890 head = page_buffers(page);
1891 bh = head;
1892 do {
1893 BUG_ON(buffer_locked(bh));
1894 if (buffer_dirty(bh))
1895 mpage_add_bh_to_extent(mpd, logical, bh);
1896 logical++;
1897 } while ((bh = bh->b_this_page) != head);
1898 }
1899
1900 return 0;
1901}
1902
1903/*
1904 * mpage_da_writepages - walk the list of dirty pages of the given
1905 * address space, allocates non-allocated blocks, maps newly-allocated
1906 * blocks to existing bhs and issues IO on them
1907 *
1908 * @mapping: address space structure to write
1909 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
1910 * @get_block: the filesystem's block mapper function.
1911 *
1912 * This is a library function, which implements the writepages()
1913 * address_space_operation.
1914 *
1915 * In order to avoid duplication of logic that deals with partial pages,
1916 * multiple bios per page, etc., we find non-allocated blocks, allocate
1917 * them with minimal calls to ->get_block() and re-use __mpage_writepage()
1918 *
1919 * It's important that we call __mpage_writepage() only once for each
1920 * involved page, otherwise we'd have to implement more complicated logic
1921 * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
1922 *
1923 * See comments to mpage_writepages()
1924 */
1925static int mpage_da_writepages(struct address_space *mapping,
1926 struct writeback_control *wbc,
1927 get_block_t get_block)
1928{
1929 struct mpage_da_data mpd;
1930 int ret;
1931
1932 if (!get_block)
1933 return generic_writepages(mapping, wbc);
1934
1935 mpd.wbc = wbc;
1936 mpd.inode = mapping->host;
1937 mpd.lbh.b_size = 0;
1938 mpd.lbh.b_state = 0;
1939 mpd.lbh.b_blocknr = 0;
1940 mpd.first_page = 0;
1941 mpd.next_page = 0;
1942 mpd.get_block = get_block;
1943
1944 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
1945
1946 /*
1947 * Handle last extent of pages
1948 */
1949 if (mpd.next_page != mpd.first_page) {
1950 mpage_da_map_blocks(&mpd);
1951 mpage_da_submit_io(&mpd);
1952 }
1953
1954 return ret;
1955}
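
The writepages walk above follows a gather-then-flush pattern: the per-page callback only extends or flushes the current extent of pages, and the caller must flush whatever is still pending once the walk ends. Below is a small standalone sketch of that pattern, with a hypothetical gather() standing in for __mpage_da_writepage(); it is illustrative only.

#include <stdio.h>

struct ctx { long first_page, next_page; };

static void flush(struct ctx *c)
{
	if (c->next_page != c->first_page)
		printf("map + submit pages %ld..%ld\n",
		       c->first_page, c->next_page - 1);
}

/* per-page callback: extend the current extent or flush and restart it */
static int gather(long page_index, struct ctx *c)
{
	if (c->next_page != page_index) {
		flush(c);
		c->first_page = page_index;
	}
	c->next_page = page_index + 1;
	return 0;
}

int main(void)
{
	struct ctx c = { 0, 0 };
	long dirty[] = { 3, 4, 5, 9 };
	int i;

	for (i = 0; i < 4; i++)
		gather(dirty[i], &c);
	flush(&c);			/* handle the last extent of pages */
	return 0;
}
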
1956
1957/*
1958 * this is a special callback for ->write_begin() only;
1959 * its intention is to return a mapped block or reserve space
1960 */
1961static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1962 struct buffer_head *bh_result, int create)
1963{
1964 int ret = 0;
1965
1966 BUG_ON(create == 0);
1967 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
1968
1969 /*
1970 * first, we need to know whether the block is allocated already
1971 * preallocated blocks are unmapped but should be treated
1972 * the same as allocated blocks.
1973 */
1974 ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
1975 if ((ret == 0) && !buffer_delay(bh_result)) {
1976 /* the block isn't (pre)allocated yet, let's reserve space */
1977 /*
1978 * XXX: __block_prepare_write() unmaps passed block,
1979 * is it OK?
1980 */
1981 ret = ext4_da_reserve_space(inode, 1);
1982 if (ret)
1983 /* not enough space to reserve */
1984 return ret;
1985
1986 map_bh(bh_result, inode->i_sb, 0);
1987 set_buffer_new(bh_result);
1988 set_buffer_delay(bh_result);
1989 } else if (ret > 0) {
1990 bh_result->b_size = (ret << inode->i_blkbits);
1991 ret = 0;
1992 }
1993
1994 return ret;
1995}
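
So at write_begin time a hole costs only a one-block reservation and is marked new and delayed, while blocks that are already mapped or already delayed pass through untouched. Below is a standalone sketch of that decision, with hypothetical stand-ins for the buffer-head state and the reservation call; it is not kernel code.

#include <stdio.h>

enum bstate { B_HOLE, B_MAPPED, B_DELAYED };

/* stand-in for ext4_da_reserve_space(); pretend it always succeeds */
static int reserve_one_block(void)
{
	return 0;
}

static int prep_block(enum bstate *b)
{
	if (*b == B_HOLE) {
		if (reserve_one_block())
			return -1;	/* would be -ENOSPC in the real code */
		*b = B_DELAYED;		/* allocation deferred until writeout */
	}
	return 0;			/* mapped or delayed: nothing to do */
}

int main(void)
{
	enum bstate b = B_HOLE;

	prep_block(&b);
	printf("state after write_begin: %d\n", b);	/* 2 == B_DELAYED */
	return 0;
}
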
1996#define EXT4_DELALLOC_RSVED 1
1997static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
1998 struct buffer_head *bh_result, int create)
1999{
2000 int ret;
2001 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2002 loff_t disksize = EXT4_I(inode)->i_disksize;
2003 handle_t *handle = NULL;
2004
2005 handle = ext4_journal_current_handle();
2006 if (!handle) {
2007 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2008 bh_result, 0, 0, 0);
2009 BUG_ON(!ret);
2010 } else {
2011 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2012 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2013 }
2014
2015 if (ret > 0) {
2016 bh_result->b_size = (ret << inode->i_blkbits);
2017
2018 /*
2019 * Update on-disk size along with block allocation
2020 * we don't use 'extend_disksize' as size may change
2021 * within already allocated block -bzzz
2022 */
2023 disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
2024 if (disksize > i_size_read(inode))
2025 disksize = i_size_read(inode);
2026 if (disksize > EXT4_I(inode)->i_disksize) {
2027 /*
2028 * XXX: replace with spinlock if seen contended -bzzz
2029 */
2030 down_write(&EXT4_I(inode)->i_data_sem);
2031 if (disksize > EXT4_I(inode)->i_disksize)
2032 EXT4_I(inode)->i_disksize = disksize;
2033 up_write(&EXT4_I(inode)->i_data_sem);
2034
2035 if (EXT4_I(inode)->i_disksize == disksize) {
2036 ret = ext4_mark_inode_dirty(handle, inode);
2037 return ret;
2038 }
2039 }
2040 ret = 0;
2041 }
2042 return ret;
2043}
2044
2045static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
2046{
2047 /*
2048 * An unmapped buffer is possible for holes;
2049 * a delayed buffer is possible with delayed allocation
2050 */
2051 return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
2052}
2053
2054static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
2055 struct buffer_head *bh_result, int create)
2056{
2057 int ret = 0;
2058 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2059
2060 /*
2061 * we don't want to do block allocation in writepage
2062 * so call get_block_wrap with create = 0
2063 */
2064 ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
2065 bh_result, 0, 0, 0);
2066 if (ret > 0) {
2067 bh_result->b_size = (ret << inode->i_blkbits);
2068 ret = 0;
2069 }
2070 return ret;
2071}
2072
2073/*
2074 * get called via ext4_da_writepages after taking page lock (have journal handle)
2075 * get called via journal_submit_inode_data_buffers (no journal handle)
2076 * get called via shrink_page_list via pdflush (no journal handle)
2077 * or __grab_cache_page when doing write_begin (have journal handle)
2078 */
2079static int ext4_da_writepage(struct page *page,
2080 struct writeback_control *wbc)
2081{
2082 int ret = 0;
2083 loff_t size;
2084 unsigned long len;
2085 struct buffer_head *page_bufs;
2086 struct inode *inode = page->mapping->host;
2087
2088 size = i_size_read(inode);
2089 if (page->index == size >> PAGE_CACHE_SHIFT)
2090 len = size & ~PAGE_CACHE_MASK;
2091 else
2092 len = PAGE_CACHE_SIZE;
2093
2094 if (page_has_buffers(page)) {
2095 page_bufs = page_buffers(page);
2096 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2097 ext4_bh_unmapped_or_delay)) {
2098 /*
2099 * We don't want to do block allocation
2100 * So redirty the page and return
2101 * We may reach here when we do a journal commit
2102 * via journal_submit_inode_data_buffers.
2103 * If we don't have mapping block we just ignore
2104 * them. We can also reach here via shrink_page_list
2105 */
2106 redirty_page_for_writepage(wbc, page);
2107 unlock_page(page);
2108 return 0;
2109 }
2110 } else {
2111 /*
2112 * The test for page_has_buffers() is subtle:
2113 * We know the page is dirty but it lost buffers. That means
2114 * that at some moment in time after write_begin()/write_end()
2115 * has been called all buffers have been clean and thus they
2116 * must have been written at least once. So they are all
2117 * mapped and we can happily proceed with mapping them
2118 * and writing the page.
2119 *
2120 * Try to initialize the buffer_heads and check whether
2121 * all are mapped and non delay. We don't want to
2122 * do block allocation here.
2123 */
2124 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
2125 ext4_normal_get_block_write);
2126 if (!ret) {
2127 page_bufs = page_buffers(page);
2128 /* check whether all are mapped and non delay */
2129 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2130 ext4_bh_unmapped_or_delay)) {
2131 redirty_page_for_writepage(wbc, page);
2132 unlock_page(page);
2133 return 0;
2134 }
2135 } else {
2136 /*
2137 * We can't do block allocation here
2138 * so just redirty the page and unlock
2139 * and return
2140 */
2141 redirty_page_for_writepage(wbc, page);
2142 unlock_page(page);
2143 return 0;
2144 }
2145 }
2146
2147 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2148 ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
2149 else
2150 ret = block_write_full_page(page,
2151 ext4_normal_get_block_write,
2152 wbc);
2153
2154 return ret;
2155}
2156
2157/*
2158 * For now just follow the DIO way to estimate the max credits
2159 * needed to write out EXT4_MAX_WRITEBACK_PAGES.
2160 * todo: need to calculate the max credits needed for
2161 * extent based files, currently the DIO credits is based on
2162 * indirect-blocks mapping way.
2163 *
2164 * Probably should have a generic way to calculate credits
2165 * for DIO, writepages, and truncate
2166 */
2167#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS
2168#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS
2169
2170static int ext4_da_writepages(struct address_space *mapping,
2171 struct writeback_control *wbc)
2172{
2173 struct inode *inode = mapping->host;
2174 handle_t *handle = NULL;
2175 int needed_blocks;
2176 int ret = 0;
2177 long to_write;
2178 loff_t range_start = 0;
2179
2180 /*
2181 * No pages to write? This is mainly a kludge to avoid starting
2182 * a transaction for special inodes like journal inode on last iput()
2183 * because that could violate lock ordering on umount
2184 */
2185 if (!mapping->nrpages)
2186 return 0;
2187
2188 /*
2189 * Estimate the worst-case credits needed to write out
2190 * EXT4_MAX_BUF_BLOCKS pages
2191 */
2192 needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
2193
2194 to_write = wbc->nr_to_write;
2195 if (!wbc->range_cyclic) {
2196 /*
2197 * If range_cyclic is not set force range_cont
2198 * and save the old writeback_index
2199 */
2200 wbc->range_cont = 1;
2201 range_start = wbc->range_start;
2202 }
2203
2204 while (!ret && to_write) {
2205 /* start a new transaction*/
2206 handle = ext4_journal_start(inode, needed_blocks);
2207 if (IS_ERR(handle)) {
2208 ret = PTR_ERR(handle);
2209 goto out_writepages;
2210 }
2211 if (ext4_should_order_data(inode)) {
2212 /*
2213 * With ordered mode we need to add
2214 * the inode to the journal handle
2215 * when we do block allocation.
2216 */
2217 ret = ext4_jbd2_file_inode(handle, inode);
2218 if (ret) {
2219 ext4_journal_stop(handle);
2220 goto out_writepages;
2221 }
2222
2223 }
2224 /*
2225 * set the max dirty pages that can be written at a time
2226 * to fit into the reserved transaction credits
2227 */
2228 if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
2229 wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
2230
2231 to_write -= wbc->nr_to_write;
2232 ret = mpage_da_writepages(mapping, wbc,
2233 ext4_da_get_block_write);
2234 ext4_journal_stop(handle);
2235 if (wbc->nr_to_write) {
2236 /*
2237 * There is no more writeout needed
2238 * or we requested a nonblocking writeout
2239 * and we found the device congested
2240 */
2241 to_write += wbc->nr_to_write;
2242 break;
2243 }
2244 wbc->nr_to_write = to_write;
2245 }
2246
2247out_writepages:
2248 wbc->nr_to_write = to_write;
2249 if (range_start)
2250 wbc->range_start = range_start;
2251 return ret;
2252}
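
The loop above caps each batch at EXT4_MAX_WRITEBACK_PAGES so the writeout fits the credits reserved for one transaction, and it stops early when the lower layer leaves pages unwritten. A minimal standalone sketch of the same chunking, with an invented page budget and a dummy write_batch() in place of mpage_da_writepages():

#include <stdio.h>

#define MAX_PAGES_PER_TXN 64	/* invented stand-in for the per-transaction cap */

/* stand-in for one writeout pass; returns the pages left unwritten */
static long write_batch(long want)
{
	(void)want;
	return 0;		/* pretend the whole batch was written */
}

int main(void)
{
	long to_write = 200;

	while (to_write) {
		long batch = to_write > MAX_PAGES_PER_TXN ?
					MAX_PAGES_PER_TXN : to_write;
		long left = write_batch(batch);

		to_write -= batch - left;
		if (left)
			break;	/* no progress: congested or nothing left dirty */
	}
	printf("pages still pending: %ld\n", to_write);
	return 0;
}
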
2253
2254static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2255 loff_t pos, unsigned len, unsigned flags,
2256 struct page **pagep, void **fsdata)
2257{
2258 int ret, retries = 0;
2259 struct page *page;
2260 pgoff_t index;
2261 unsigned from, to;
2262 struct inode *inode = mapping->host;
2263 handle_t *handle;
2264
2265 index = pos >> PAGE_CACHE_SHIFT;
2266 from = pos & (PAGE_CACHE_SIZE - 1);
2267 to = from + len;
2268
2269retry:
2270 /*
2271 * With delayed allocation, we don't log the i_disksize update
2272 * if there is delayed block allocation. But we still need
2273 * to journal the i_disksize update if we write to the end
2274 * of a file whose last buffer is already mapped.
2275 */
2276 handle = ext4_journal_start(inode, 1);
2277 if (IS_ERR(handle)) {
2278 ret = PTR_ERR(handle);
2279 goto out;
2280 }
2281
2282 page = __grab_cache_page(mapping, index);
2283 if (!page)
2284 return -ENOMEM;
2285 *pagep = page;
2286
2287 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
2288 ext4_da_get_block_prep);
2289 if (ret < 0) {
2290 unlock_page(page);
2291 ext4_journal_stop(handle);
2292 page_cache_release(page);
2293 }
2294
2295 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
2296 goto retry;
2297out:
2298 return ret;
2299}
2300
2301/*
2302 * Check if we should update i_disksize
2303 * when writing to the end of a file that does not require block allocation
2304 */
2305static int ext4_da_should_update_i_disksize(struct page *page,
2306 unsigned long offset)
2307{
2308 struct buffer_head *bh;
2309 struct inode *inode = page->mapping->host;
2310 unsigned int idx;
2311 int i;
2312
2313 bh = page_buffers(page);
2314 idx = offset >> inode->i_blkbits;
2315
2316 for (i=0; i < idx; i++)
2317 bh = bh->b_this_page;
2318
2319 if (!buffer_mapped(bh) || (buffer_delay(bh)))
2320 return 0;
2321 return 1;
2322}
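
As a worked example, with 4 KiB pages and 1 KiB blocks a write whose last byte lands at offset 2500 inside the page maps to buffer index 2500 >> 10 = 2, and only that buffer's mapped/delayed state decides whether i_disksize may be bumped without allocation. The trivial sketch below (illustrative only) just performs that index calculation.

#include <stdio.h>

int main(void)
{
	unsigned blkbits = 10;			/* 1 KiB blocks */
	unsigned long end_offset = 2500;	/* last copied byte within the page */

	/* prints 2: the third buffer head in the page is the one to inspect */
	printf("buffer index to inspect: %lu\n", end_offset >> blkbits);
	return 0;
}
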
2323
2324static int ext4_da_write_end(struct file *file,
2325 struct address_space *mapping,
2326 loff_t pos, unsigned len, unsigned copied,
2327 struct page *page, void *fsdata)
2328{
2329 struct inode *inode = mapping->host;
2330 int ret = 0, ret2;
2331 handle_t *handle = ext4_journal_current_handle();
2332 loff_t new_i_size;
2333 unsigned long start, end;
2334
2335 start = pos & (PAGE_CACHE_SIZE - 1);
2336 end = start + copied -1;
2337
2338 /*
2339 * generic_write_end() will run mark_inode_dirty() if i_size
2340 * changes. So let's piggyback the i_disksize mark_inode_dirty
2341 * into that.
2342 */
2343
2344 new_i_size = pos + copied;
2345 if (new_i_size > EXT4_I(inode)->i_disksize) {
2346 if (ext4_da_should_update_i_disksize(page, end)) {
2347 down_write(&EXT4_I(inode)->i_data_sem);
2348 if (new_i_size > EXT4_I(inode)->i_disksize) {
2349 /*
2350 * Updating i_disksize when extending file
2351 * without needing block allocation
2352 */
2353 if (ext4_should_order_data(inode))
2354 ret = ext4_jbd2_file_inode(handle,
2355 inode);
2356
2357 EXT4_I(inode)->i_disksize = new_i_size;
2358 }
2359 up_write(&EXT4_I(inode)->i_data_sem);
2360 }
2361 }
2362 ret2 = generic_write_end(file, mapping, pos, len, copied,
2363 page, fsdata);
2364 copied = ret2;
2365 if (ret2 < 0)
2366 ret = ret2;
2367 ret2 = ext4_journal_stop(handle);
2368 if (!ret)
2369 ret = ret2;
2370
2371 return ret ? ret : copied;
2372}
2373
2374static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
2375{
2376 /*
2377 * Drop reserved blocks
2378 */
2379 BUG_ON(!PageLocked(page));
2380 if (!page_has_buffers(page))
2381 goto out;
2382
2383 ext4_da_page_release_reservation(page, offset);
2384
2385out:
2386 ext4_invalidatepage(page, offset);
2387
2388 return;
2389}
2390
1400 2391
1401/* 2392/*
1402 * bmap() is special. It gets used by applications such as lilo and by 2393 * bmap() is special. It gets used by applications such as lilo and by
@@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
1418 journal_t *journal; 2409 journal_t *journal;
1419 int err; 2410 int err;
1420 2411
2412 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
2413 test_opt(inode->i_sb, DELALLOC)) {
2414 /*
2415 * With delalloc we want to sync the file
2416 * so that we can make sure we allocate
2417 * blocks for file
2418 */
2419 filemap_write_and_wait(mapping);
2420 }
2421
1421 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 2422 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
1422 /* 2423 /*
1423 * This is a REALLY heavyweight approach, but the use of 2424 * This is a REALLY heavyweight approach, but the use of
@@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
1462 return 0; 2463 return 0;
1463} 2464}
1464 2465
1465static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1466{
1467 if (buffer_mapped(bh))
1468 return ext4_journal_dirty_data(handle, bh);
1469 return 0;
1470}
1471
1472/* 2466/*
1473 * Note that we always start a transaction even if we're not journalling 2467 * Note that we don't need to start a transaction unless we're journaling data
1474 * data. This is to preserve ordering: any hole instantiation within 2468 * because we should have holes filled from ext4_page_mkwrite(). We even don't
1475 * __block_write_full_page -> ext4_get_block() should be journalled 2469 * need to file the inode to the transaction's list in ordered mode because if
1476 * along with the data so we don't crash and then get metadata which 2470 * we are writing back data added by write(), the inode is already there and if
1477 * refers to old data. 2471 * we are writing back data modified via mmap(), no one guarantees in which
2472 * transaction the data will hit the disk. In case we are journaling data, we
2473 * cannot start transaction directly because transaction start ranks above page
2474 * lock so we have to do some magic.
1478 * 2475 *
1479 * In all journalling modes block_write_full_page() will start the I/O. 2476 * In all journaling modes block_write_full_page() will start the I/O.
1480 * 2477 *
1481 * Problem: 2478 * Problem:
1482 * 2479 *
@@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1518 * disastrous. Any write() or metadata operation will sync the fs for 2515 * disastrous. Any write() or metadata operation will sync the fs for
1519 * us. 2516 * us.
1520 * 2517 *
1521 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1522 * we don't need to open a transaction here.
1523 */ 2518 */
1524static int ext4_ordered_writepage(struct page *page, 2519static int __ext4_normal_writepage(struct page *page,
1525 struct writeback_control *wbc) 2520 struct writeback_control *wbc)
1526{ 2521{
1527 struct inode *inode = page->mapping->host; 2522 struct inode *inode = page->mapping->host;
1528 struct buffer_head *page_bufs;
1529 handle_t *handle = NULL;
1530 int ret = 0;
1531 int err;
1532
1533 J_ASSERT(PageLocked(page));
1534
1535 /*
1536 * We give up here if we're reentered, because it might be for a
1537 * different filesystem.
1538 */
1539 if (ext4_journal_current_handle())
1540 goto out_fail;
1541 2523
1542 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2524 if (test_opt(inode->i_sb, NOBH))
2525 return nobh_writepage(page,
2526 ext4_normal_get_block_write, wbc);
2527 else
2528 return block_write_full_page(page,
2529 ext4_normal_get_block_write,
2530 wbc);
2531}
1543 2532
1544 if (IS_ERR(handle)) { 2533static int ext4_normal_writepage(struct page *page,
1545 ret = PTR_ERR(handle); 2534 struct writeback_control *wbc)
1546 goto out_fail; 2535{
1547 } 2536 struct inode *inode = page->mapping->host;
2537 loff_t size = i_size_read(inode);
2538 loff_t len;
1548 2539
1549 if (!page_has_buffers(page)) { 2540 J_ASSERT(PageLocked(page));
1550 create_empty_buffers(page, inode->i_sb->s_blocksize, 2541 if (page->index == size >> PAGE_CACHE_SHIFT)
1551 (1 << BH_Dirty)|(1 << BH_Uptodate)); 2542 len = size & ~PAGE_CACHE_MASK;
2543 else
2544 len = PAGE_CACHE_SIZE;
2545
2546 if (page_has_buffers(page)) {
2547 /* if the page has buffers they should all be mapped
2548 * and allocated. If there are no buffers attached
2549 * to the page we know the page is dirty but it lost
2550 * buffers. That means that at some moment in time
2551 * after write_begin() / write_end() has been called
2552 * all buffers have been clean and thus they must have been
2553 * written at least once. So they are all mapped and we can
2554 * happily proceed with mapping them and writing the page.
2555 */
2556 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
2557 ext4_bh_unmapped_or_delay));
1552 } 2558 }
1553 page_bufs = page_buffers(page);
1554 walk_page_buffers(handle, page_bufs, 0,
1555 PAGE_CACHE_SIZE, NULL, bget_one);
1556
1557 ret = block_write_full_page(page, ext4_get_block, wbc);
1558 2559
1559 /* 2560 if (!ext4_journal_current_handle())
1560 * The page can become unlocked at any point now, and 2561 return __ext4_normal_writepage(page, wbc);
1561 * truncate can then come in and change things. So we
1562 * can't touch *page from now on. But *page_bufs is
1563 * safe due to elevated refcount.
1564 */
1565 2562
1566 /*
1567 * And attach them to the current transaction. But only if
1568 * block_write_full_page() succeeded. Otherwise they are unmapped,
1569 * and generally junk.
1570 */
1571 if (ret == 0) {
1572 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1573 NULL, jbd2_journal_dirty_data_fn);
1574 if (!ret)
1575 ret = err;
1576 }
1577 walk_page_buffers(handle, page_bufs, 0,
1578 PAGE_CACHE_SIZE, NULL, bput_one);
1579 err = ext4_journal_stop(handle);
1580 if (!ret)
1581 ret = err;
1582 return ret;
1583
1584out_fail:
1585 redirty_page_for_writepage(wbc, page); 2563 redirty_page_for_writepage(wbc, page);
1586 unlock_page(page); 2564 unlock_page(page);
1587 return ret; 2565 return 0;
1588} 2566}
1589 2567
1590static int ext4_writeback_writepage(struct page *page, 2568static int __ext4_journalled_writepage(struct page *page,
1591 struct writeback_control *wbc) 2569 struct writeback_control *wbc)
1592{ 2570{
1593 struct inode *inode = page->mapping->host; 2571 struct address_space *mapping = page->mapping;
2572 struct inode *inode = mapping->host;
2573 struct buffer_head *page_bufs;
1594 handle_t *handle = NULL; 2574 handle_t *handle = NULL;
1595 int ret = 0; 2575 int ret = 0;
1596 int err; 2576 int err;
1597 2577
1598 if (ext4_journal_current_handle()) 2578 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1599 goto out_fail; 2579 ext4_normal_get_block_write);
2580 if (ret != 0)
2581 goto out_unlock;
2582
2583 page_bufs = page_buffers(page);
2584 walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
2585 bget_one);
2586 /* As soon as we unlock the page, it can go away, but we have
2587 * references to buffers so we are safe */
2588 unlock_page(page);
1600 2589
1601 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2590 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1602 if (IS_ERR(handle)) { 2591 if (IS_ERR(handle)) {
1603 ret = PTR_ERR(handle); 2592 ret = PTR_ERR(handle);
1604 goto out_fail; 2593 goto out;
1605 } 2594 }
1606 2595
1607 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2596 ret = walk_page_buffers(handle, page_bufs, 0,
1608 ret = nobh_writepage(page, ext4_get_block, wbc); 2597 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1609 else
1610 ret = block_write_full_page(page, ext4_get_block, wbc);
1611 2598
2599 err = walk_page_buffers(handle, page_bufs, 0,
2600 PAGE_CACHE_SIZE, NULL, write_end_fn);
2601 if (ret == 0)
2602 ret = err;
1612 err = ext4_journal_stop(handle); 2603 err = ext4_journal_stop(handle);
1613 if (!ret) 2604 if (!ret)
1614 ret = err; 2605 ret = err;
1615 return ret;
1616 2606
1617out_fail: 2607 walk_page_buffers(handle, page_bufs, 0,
1618 redirty_page_for_writepage(wbc, page); 2608 PAGE_CACHE_SIZE, NULL, bput_one);
2609 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
2610 goto out;
2611
2612out_unlock:
1619 unlock_page(page); 2613 unlock_page(page);
2614out:
1620 return ret; 2615 return ret;
1621} 2616}
1622 2617
@@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page,
1624 struct writeback_control *wbc) 2619 struct writeback_control *wbc)
1625{ 2620{
1626 struct inode *inode = page->mapping->host; 2621 struct inode *inode = page->mapping->host;
1627 handle_t *handle = NULL; 2622 loff_t size = i_size_read(inode);
1628 int ret = 0; 2623 loff_t len;
1629 int err;
1630 2624
1631 if (ext4_journal_current_handle()) 2625 J_ASSERT(PageLocked(page));
1632 goto no_write; 2626 if (page->index == size >> PAGE_CACHE_SHIFT)
2627 len = size & ~PAGE_CACHE_MASK;
2628 else
2629 len = PAGE_CACHE_SIZE;
2630
2631 if (page_has_buffers(page)) {
 2632 /* If the page has buffers, they should all be mapped
 2633 * and allocated. If there are no buffers attached
 2634 * to the page, we know the page is dirty but it lost
 2635 * its buffers. That means that at some moment in time
 2636 * after write_begin() / write_end() was called,
 2637 * all buffers were clean, and thus they must have been
 2638 * written at least once. So they are all mapped and we can
 2639 * happily proceed with mapping them and writing the page.
2640 */
2641 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
2642 ext4_bh_unmapped_or_delay));
2643 }
1633 2644
1634 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 2645 if (ext4_journal_current_handle())
1635 if (IS_ERR(handle)) {
1636 ret = PTR_ERR(handle);
1637 goto no_write; 2646 goto no_write;
1638 }
1639 2647
1640 if (!page_has_buffers(page) || PageChecked(page)) { 2648 if (PageChecked(page)) {
1641 /* 2649 /*
1642 * It's mmapped pagecache. Add buffers and journal it. There 2650 * It's mmapped pagecache. Add buffers and journal it. There
1643 * doesn't seem much point in redirtying the page here. 2651 * doesn't seem much point in redirtying the page here.
1644 */ 2652 */
1645 ClearPageChecked(page); 2653 ClearPageChecked(page);
1646 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 2654 return __ext4_journalled_writepage(page, wbc);
1647 ext4_get_block);
1648 if (ret != 0) {
1649 ext4_journal_stop(handle);
1650 goto out_unlock;
1651 }
1652 ret = walk_page_buffers(handle, page_buffers(page), 0,
1653 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1654
1655 err = walk_page_buffers(handle, page_buffers(page), 0,
1656 PAGE_CACHE_SIZE, NULL, write_end_fn);
1657 if (ret == 0)
1658 ret = err;
1659 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1660 unlock_page(page);
1661 } else { 2655 } else {
1662 /* 2656 /*
1663 * It may be a page full of checkpoint-mode buffers. We don't 2657 * It may be a page full of checkpoint-mode buffers. We don't
1664 * really know unless we go poke around in the buffer_heads. 2658 * really know unless we go poke around in the buffer_heads.
1665 * But block_write_full_page will do the right thing. 2659 * But block_write_full_page will do the right thing.
1666 */ 2660 */
1667 ret = block_write_full_page(page, ext4_get_block, wbc); 2661 return block_write_full_page(page,
2662 ext4_normal_get_block_write,
2663 wbc);
1668 } 2664 }
1669 err = ext4_journal_stop(handle);
1670 if (!ret)
1671 ret = err;
1672out:
1673 return ret;
1674
1675no_write: 2665no_write:
1676 redirty_page_for_writepage(wbc, page); 2666 redirty_page_for_writepage(wbc, page);
1677out_unlock:
1678 unlock_page(page); 2667 unlock_page(page);
1679 goto out; 2668 return 0;
1680} 2669}
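
Every writepage path above computes the writable length the same way: only the page containing i_size is written partially, everything else is a full page. A minimal sketch of the idiom, assuming the PAGE_CACHE_* definitions of this kernel generation:

	loff_t size = i_size_read(inode);
	unsigned int len;

	if (page->index == size >> PAGE_CACHE_SHIFT)
		/* page straddling EOF: only bytes below i_size count */
		len = size & ~PAGE_CACHE_MASK;
	else
		/* any page fully below EOF is written in full */
		len = PAGE_CACHE_SIZE;

With 4 KiB pages and i_size = 10000, page index 2 (bytes 8192..12287) contains EOF, so len = 10000 & 4095 = 1808.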
1681 2670
1682static int ext4_readpage(struct file *file, struct page *page) 2671static int ext4_readpage(struct file *file, struct page *page)
@@ -1819,7 +2808,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
1819static const struct address_space_operations ext4_ordered_aops = { 2808static const struct address_space_operations ext4_ordered_aops = {
1820 .readpage = ext4_readpage, 2809 .readpage = ext4_readpage,
1821 .readpages = ext4_readpages, 2810 .readpages = ext4_readpages,
1822 .writepage = ext4_ordered_writepage, 2811 .writepage = ext4_normal_writepage,
1823 .sync_page = block_sync_page, 2812 .sync_page = block_sync_page,
1824 .write_begin = ext4_write_begin, 2813 .write_begin = ext4_write_begin,
1825 .write_end = ext4_ordered_write_end, 2814 .write_end = ext4_ordered_write_end,
@@ -1833,7 +2822,7 @@ static const struct address_space_operations ext4_ordered_aops = {
1833static const struct address_space_operations ext4_writeback_aops = { 2822static const struct address_space_operations ext4_writeback_aops = {
1834 .readpage = ext4_readpage, 2823 .readpage = ext4_readpage,
1835 .readpages = ext4_readpages, 2824 .readpages = ext4_readpages,
1836 .writepage = ext4_writeback_writepage, 2825 .writepage = ext4_normal_writepage,
1837 .sync_page = block_sync_page, 2826 .sync_page = block_sync_page,
1838 .write_begin = ext4_write_begin, 2827 .write_begin = ext4_write_begin,
1839 .write_end = ext4_writeback_write_end, 2828 .write_end = ext4_writeback_write_end,
@@ -1857,10 +2846,31 @@ static const struct address_space_operations ext4_journalled_aops = {
1857 .releasepage = ext4_releasepage, 2846 .releasepage = ext4_releasepage,
1858}; 2847};
1859 2848
2849static const struct address_space_operations ext4_da_aops = {
2850 .readpage = ext4_readpage,
2851 .readpages = ext4_readpages,
2852 .writepage = ext4_da_writepage,
2853 .writepages = ext4_da_writepages,
2854 .sync_page = block_sync_page,
2855 .write_begin = ext4_da_write_begin,
2856 .write_end = ext4_da_write_end,
2857 .bmap = ext4_bmap,
2858 .invalidatepage = ext4_da_invalidatepage,
2859 .releasepage = ext4_releasepage,
2860 .direct_IO = ext4_direct_IO,
2861 .migratepage = buffer_migrate_page,
2862};
2863
1860void ext4_set_aops(struct inode *inode) 2864void ext4_set_aops(struct inode *inode)
1861{ 2865{
1862 if (ext4_should_order_data(inode)) 2866 if (ext4_should_order_data(inode) &&
2867 test_opt(inode->i_sb, DELALLOC))
2868 inode->i_mapping->a_ops = &ext4_da_aops;
2869 else if (ext4_should_order_data(inode))
1863 inode->i_mapping->a_ops = &ext4_ordered_aops; 2870 inode->i_mapping->a_ops = &ext4_ordered_aops;
2871 else if (ext4_should_writeback_data(inode) &&
2872 test_opt(inode->i_sb, DELALLOC))
2873 inode->i_mapping->a_ops = &ext4_da_aops;
1864 else if (ext4_should_writeback_data(inode)) 2874 else if (ext4_should_writeback_data(inode))
1865 inode->i_mapping->a_ops = &ext4_writeback_aops; 2875 inode->i_mapping->a_ops = &ext4_writeback_aops;
1866 else 2876 else
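
The four-way selection in ext4_set_aops() collapses to a simpler rule: delayed allocation wins whenever the inode is in ordered or writeback mode, and journalled data always keeps its own aops. An equivalent restatement, offered as a sketch rather than the patch's code:

	if (test_opt(inode->i_sb, DELALLOC) &&
	    !ext4_should_journal_data(inode))
		inode->i_mapping->a_ops = &ext4_da_aops;
	else if (ext4_should_order_data(inode))
		inode->i_mapping->a_ops = &ext4_ordered_aops;
	else if (ext4_should_writeback_data(inode))
		inode->i_mapping->a_ops = &ext4_writeback_aops;
	else
		inode->i_mapping->a_ops = &ext4_journalled_aops;

This relies on the three data-mode predicates being mutually exclusive.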
@@ -1873,7 +2883,7 @@ void ext4_set_aops(struct inode *inode)
 1873 * This is required during truncate. We need to physically zero the tail end 2883
1874 * of that block so it doesn't yield old data if the file is later grown. 2884 * of that block so it doesn't yield old data if the file is later grown.
1875 */ 2885 */
1876int ext4_block_truncate_page(handle_t *handle, struct page *page, 2886int ext4_block_truncate_page(handle_t *handle,
1877 struct address_space *mapping, loff_t from) 2887 struct address_space *mapping, loff_t from)
1878{ 2888{
1879 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 2889 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1882,8 +2892,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
1882 ext4_lblk_t iblock; 2892 ext4_lblk_t iblock;
1883 struct inode *inode = mapping->host; 2893 struct inode *inode = mapping->host;
1884 struct buffer_head *bh; 2894 struct buffer_head *bh;
2895 struct page *page;
1885 int err = 0; 2896 int err = 0;
1886 2897
2898 page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
2899 if (!page)
2900 return -EINVAL;
2901
1887 blocksize = inode->i_sb->s_blocksize; 2902 blocksize = inode->i_sb->s_blocksize;
1888 length = blocksize - (offset & (blocksize - 1)); 2903 length = blocksize - (offset & (blocksize - 1));
1889 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 2904 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -1956,7 +2971,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
1956 err = ext4_journal_dirty_metadata(handle, bh); 2971 err = ext4_journal_dirty_metadata(handle, bh);
1957 } else { 2972 } else {
1958 if (ext4_should_order_data(inode)) 2973 if (ext4_should_order_data(inode))
1959 err = ext4_journal_dirty_data(handle, bh); 2974 err = ext4_jbd2_file_inode(handle, inode);
1960 mark_buffer_dirty(bh); 2975 mark_buffer_dirty(bh);
1961 } 2976 }
1962 2977
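
The tail-zeroing arithmetic in ext4_block_truncate_page() is easy to check by hand: with a 4096-byte block size and from = 5000, offset & (blocksize - 1) = 904 and length = 4096 - 904 = 3192, i.e. the last 3192 bytes of the block straddling the new EOF are zeroed.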
@@ -2179,7 +3194,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
2179 3194
2180 if (this_bh) { 3195 if (this_bh) {
2181 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); 3196 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
2182 ext4_journal_dirty_metadata(handle, this_bh); 3197
3198 /*
3199 * The buffer head should have an attached journal head at this
3200 * point. However, if the data is corrupted and an indirect
3201 * block pointed to itself, it would have been detached when
3202 * the block was cleared. Check for this instead of OOPSing.
3203 */
3204 if (bh2jh(this_bh))
3205 ext4_journal_dirty_metadata(handle, this_bh);
3206 else
3207 ext4_error(inode->i_sb, __func__,
3208 "circular indirect block detected, "
3209 "inode=%lu, block=%llu",
3210 inode->i_ino,
3211 (unsigned long long) this_bh->b_blocknr);
2183 } 3212 }
2184} 3213}
2185 3214
@@ -2305,6 +3334,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
2305 } 3334 }
2306} 3335}
2307 3336
3337int ext4_can_truncate(struct inode *inode)
3338{
3339 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
3340 return 0;
3341 if (S_ISREG(inode->i_mode))
3342 return 1;
3343 if (S_ISDIR(inode->i_mode))
3344 return 1;
3345 if (S_ISLNK(inode->i_mode))
3346 return !ext4_inode_is_fast_symlink(inode);
3347 return 0;
3348}
3349
2308/* 3350/*
2309 * ext4_truncate() 3351 * ext4_truncate()
2310 * 3352 *
@@ -2347,51 +3389,25 @@ void ext4_truncate(struct inode *inode)
2347 int n; 3389 int n;
2348 ext4_lblk_t last_block; 3390 ext4_lblk_t last_block;
2349 unsigned blocksize = inode->i_sb->s_blocksize; 3391 unsigned blocksize = inode->i_sb->s_blocksize;
2350 struct page *page;
2351 3392
2352 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 3393 if (!ext4_can_truncate(inode))
2353 S_ISLNK(inode->i_mode)))
2354 return;
2355 if (ext4_inode_is_fast_symlink(inode))
2356 return;
2357 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2358 return; 3394 return;
2359 3395
2360 /*
2361 * We have to lock the EOF page here, because lock_page() nests
2362 * outside jbd2_journal_start().
2363 */
2364 if ((inode->i_size & (blocksize - 1)) == 0) {
2365 /* Block boundary? Nothing to do */
2366 page = NULL;
2367 } else {
2368 page = grab_cache_page(mapping,
2369 inode->i_size >> PAGE_CACHE_SHIFT);
2370 if (!page)
2371 return;
2372 }
2373
2374 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3396 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
2375 ext4_ext_truncate(inode, page); 3397 ext4_ext_truncate(inode);
2376 return; 3398 return;
2377 } 3399 }
2378 3400
2379 handle = start_transaction(inode); 3401 handle = start_transaction(inode);
2380 if (IS_ERR(handle)) { 3402 if (IS_ERR(handle))
2381 if (page) {
2382 clear_highpage(page);
2383 flush_dcache_page(page);
2384 unlock_page(page);
2385 page_cache_release(page);
2386 }
2387 return; /* AKPM: return what? */ 3403 return; /* AKPM: return what? */
2388 }
2389 3404
2390 last_block = (inode->i_size + blocksize-1) 3405 last_block = (inode->i_size + blocksize-1)
2391 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 3406 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
2392 3407
2393 if (page) 3408 if (inode->i_size & (blocksize - 1))
2394 ext4_block_truncate_page(handle, page, mapping, inode->i_size); 3409 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
3410 goto out_stop;
2395 3411
2396 n = ext4_block_to_path(inode, last_block, offsets, NULL); 3412 n = ext4_block_to_path(inode, last_block, offsets, NULL);
2397 if (n == 0) 3413 if (n == 0)
@@ -2410,6 +3426,11 @@ void ext4_truncate(struct inode *inode)
2410 goto out_stop; 3426 goto out_stop;
2411 3427
2412 /* 3428 /*
3429 * From here we block out all ext4_get_block() callers who want to
3430 * modify the block allocation tree.
3431 */
3432 down_write(&ei->i_data_sem);
3433 /*
2413 * The orphan list entry will now protect us from any crash which 3434 * The orphan list entry will now protect us from any crash which
2414 * occurs before the truncate completes, so it is now safe to propagate 3435 * occurs before the truncate completes, so it is now safe to propagate
2415 * the new, shorter inode size (held for now in i_size) into the 3436 * the new, shorter inode size (held for now in i_size) into the
@@ -2418,12 +3439,6 @@ void ext4_truncate(struct inode *inode)
2418 */ 3439 */
2419 ei->i_disksize = inode->i_size; 3440 ei->i_disksize = inode->i_size;
2420 3441
2421 /*
2422 * From here we block out all ext4_get_block() callers who want to
2423 * modify the block allocation tree.
2424 */
2425 down_write(&ei->i_data_sem);
2426
2427 if (n == 1) { /* direct blocks */ 3442 if (n == 1) { /* direct blocks */
2428 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 3443 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
2429 i_data + EXT4_NDIR_BLOCKS); 3444 i_data + EXT4_NDIR_BLOCKS);
@@ -3107,7 +4122,14 @@ int ext4_write_inode(struct inode *inode, int wait)
3107 * be freed, so we have a strong guarantee that no future commit will 4122 * be freed, so we have a strong guarantee that no future commit will
3108 * leave these blocks visible to the user.) 4123 * leave these blocks visible to the user.)
3109 * 4124 *
 3110 * Called with inode->sem down. 4125 * Another thing we have to ensure is that if we are in ordered mode
 4126 * and the inode is still attached to the committing transaction, we must
 4127 * start writeout of all the dirty pages which are being truncated.
4128 * This way we are sure that all the data written in the previous
4129 * transaction are already on disk (truncate waits for pages under
4130 * writeback).
4131 *
4132 * Called with inode->i_mutex down.
3111 */ 4133 */
3112int ext4_setattr(struct dentry *dentry, struct iattr *attr) 4134int ext4_setattr(struct dentry *dentry, struct iattr *attr)
3113{ 4135{
@@ -3173,6 +4195,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
3173 if (!error) 4195 if (!error)
3174 error = rc; 4196 error = rc;
3175 ext4_journal_stop(handle); 4197 ext4_journal_stop(handle);
4198
4199 if (ext4_should_order_data(inode)) {
4200 error = ext4_begin_ordered_truncate(inode,
4201 attr->ia_size);
4202 if (error) {
4203 /* Do as much error cleanup as possible */
4204 handle = ext4_journal_start(inode, 3);
4205 if (IS_ERR(handle)) {
4206 ext4_orphan_del(NULL, inode);
4207 goto err_out;
4208 }
4209 ext4_orphan_del(handle, inode);
4210 ext4_journal_stop(handle);
4211 goto err_out;
4212 }
4213 }
3176 } 4214 }
3177 4215
3178 rc = inode_setattr(inode, attr); 4216 rc = inode_setattr(inode, attr);
@@ -3193,6 +4231,32 @@ err_out:
3193 return error; 4231 return error;
3194} 4232}
3195 4233
4234int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4235 struct kstat *stat)
4236{
4237 struct inode *inode;
4238 unsigned long delalloc_blocks;
4239
4240 inode = dentry->d_inode;
4241 generic_fillattr(inode, stat);
4242
4243 /*
 4244 * We can't update i_blocks if the block allocation is delayed;
 4245 * otherwise, in the case of a system crash before the real block
 4246 * allocation is done, we would have i_blocks inconsistent with
 4247 * the on-disk file blocks.
 4248 * We always keep i_blocks updated together with the real
 4249 * allocation. But so as not to confuse userspace, stat
 4250 * will return block counts that include the delayed allocation
 4251 * blocks for this file.
4252 */
4253 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
4254 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
4255 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
4256
4257 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
4258 return 0;
4259}
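
The final conversion in ext4_getattr() deserves a worked example: stat reports 512-byte sectors, so each delayed block contributes blocksize / 512 of them. With a 4 KiB block size (s_blocksize_bits = 12) and delalloc_blocks = 3, (3 << 12) >> 9 = 24 sectors are added to stat->blocks.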
3196 4260
3197/* 4261/*
3198 * How many blocks doth make a writepage()? 4262 * How many blocks doth make a writepage()?
@@ -3506,3 +4570,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
3506 4570
3507 return err; 4571 return err;
3508} 4572}
4573
4574static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
4575{
4576 return !buffer_mapped(bh);
4577}
4578
4579int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4580{
4581 loff_t size;
4582 unsigned long len;
4583 int ret = -EINVAL;
4584 struct file *file = vma->vm_file;
4585 struct inode *inode = file->f_path.dentry->d_inode;
4586 struct address_space *mapping = inode->i_mapping;
4587
4588 /*
4589 * Get i_alloc_sem to stop truncates messing with the inode. We cannot
4590 * get i_mutex because we are already holding mmap_sem.
4591 */
4592 down_read(&inode->i_alloc_sem);
4593 size = i_size_read(inode);
4594 if (page->mapping != mapping || size <= page_offset(page)
4595 || !PageUptodate(page)) {
4596 /* page got truncated from under us? */
4597 goto out_unlock;
4598 }
4599 ret = 0;
4600 if (PageMappedToDisk(page))
4601 goto out_unlock;
4602
4603 if (page->index == size >> PAGE_CACHE_SHIFT)
4604 len = size & ~PAGE_CACHE_MASK;
4605 else
4606 len = PAGE_CACHE_SIZE;
4607
4608 if (page_has_buffers(page)) {
4609 /* return if we have all the buffers mapped */
4610 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
4611 ext4_bh_unmapped))
4612 goto out_unlock;
4613 }
4614 /*
 4615 * OK, we need to fill the hole... Do write_begin/write_end
 4616 * to do the block allocation/reservation. We are not holding
 4617 * inode->i_mutex here, which allows parallel write_begin/
 4618 * write_end calls. lock_page prevents this from happening
 4619 * on the same page, though.
4620 */
4621 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
4622 len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
4623 if (ret < 0)
4624 goto out_unlock;
4625 ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
4626 len, len, page, NULL);
4627 if (ret < 0)
4628 goto out_unlock;
4629 ret = 0;
4630out_unlock:
4631 up_read(&inode->i_alloc_sem);
4632 return ret;
4633}
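
ext4_page_mkwrite() only takes effect once it is installed in the file's vm_operations_struct; the corresponding fs/ext4/file.c hunk is part of this series but not shown in this excerpt. A sketch of the expected wiring, under the assumption that it follows the stock filemap_fault pattern:

static struct vm_operations_struct ext4_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= ext4_page_mkwrite,
};

static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	/* route write faults through ext4_page_mkwrite() */
	vma->vm_ops = &ext4_file_vm_ops;
	return 0;
}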
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c9900aade150..8d141a25bbee 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
381 381
382static inline int mb_find_next_zero_bit(void *addr, int max, int start) 382static inline int mb_find_next_zero_bit(void *addr, int max, int start)
383{ 383{
384 int fix = 0; 384 int fix = 0, ret, tmpmax;
385 addr = mb_correct_addr_and_bit(&fix, addr); 385 addr = mb_correct_addr_and_bit(&fix, addr);
386 max += fix; 386 tmpmax = max + fix;
387 start += fix; 387 start += fix;
388 388
389 return ext4_find_next_zero_bit(addr, max, start) - fix; 389 ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
390 if (ret > max)
391 return max;
392 return ret;
390} 393}
391 394
392static inline int mb_find_next_bit(void *addr, int max, int start) 395static inline int mb_find_next_bit(void *addr, int max, int start)
393{ 396{
394 int fix = 0; 397 int fix = 0, ret, tmpmax;
395 addr = mb_correct_addr_and_bit(&fix, addr); 398 addr = mb_correct_addr_and_bit(&fix, addr);
396 max += fix; 399 tmpmax = max + fix;
397 start += fix; 400 start += fix;
398 401
399 return ext4_find_next_bit(addr, max, start) - fix; 402 ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
403 if (ret > max)
404 return max;
405 return ret;
400} 406}
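
The clamp added here guards against the generic find-next-bit helpers overshooting their size argument, which some architectures do when the limit is not word-aligned; after the fix offset is subtracted, the result could land past max. Returning exactly max gives callers a single exhaustion test, and is what lets the caller-side clamp in ext4_mb_release_inode_pa() be deleted later in this diff:

	bit = mb_find_next_bit(bitmap, max, start);
	if (bit >= max)
		break;	/* bitmap exhausted; never a past-max value */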
401 407
402static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) 408static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
@@ -803,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
803 if (!buffer_uptodate(bh[i])) 809 if (!buffer_uptodate(bh[i]))
804 goto out; 810 goto out;
805 811
812 err = 0;
806 first_block = page->index * blocks_per_page; 813 first_block = page->index * blocks_per_page;
807 for (i = 0; i < blocks_per_page; i++) { 814 for (i = 0; i < blocks_per_page; i++) {
808 int group; 815 int group;
@@ -883,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
883 int pnum; 890 int pnum;
884 int poff; 891 int poff;
885 struct page *page; 892 struct page *page;
893 int ret;
886 894
887 mb_debug("load group %lu\n", group); 895 mb_debug("load group %lu\n", group);
888 896
@@ -914,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
914 if (page) { 922 if (page) {
915 BUG_ON(page->mapping != inode->i_mapping); 923 BUG_ON(page->mapping != inode->i_mapping);
916 if (!PageUptodate(page)) { 924 if (!PageUptodate(page)) {
917 ext4_mb_init_cache(page, NULL); 925 ret = ext4_mb_init_cache(page, NULL);
926 if (ret) {
927 unlock_page(page);
928 goto err;
929 }
918 mb_cmp_bitmaps(e4b, page_address(page) + 930 mb_cmp_bitmaps(e4b, page_address(page) +
919 (poff * sb->s_blocksize)); 931 (poff * sb->s_blocksize));
920 } 932 }
921 unlock_page(page); 933 unlock_page(page);
922 } 934 }
923 } 935 }
924 if (page == NULL || !PageUptodate(page)) 936 if (page == NULL || !PageUptodate(page)) {
937 ret = -EIO;
925 goto err; 938 goto err;
939 }
926 e4b->bd_bitmap_page = page; 940 e4b->bd_bitmap_page = page;
927 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 941 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
928 mark_page_accessed(page); 942 mark_page_accessed(page);
@@ -938,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
938 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 952 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
939 if (page) { 953 if (page) {
940 BUG_ON(page->mapping != inode->i_mapping); 954 BUG_ON(page->mapping != inode->i_mapping);
941 if (!PageUptodate(page)) 955 if (!PageUptodate(page)) {
942 ext4_mb_init_cache(page, e4b->bd_bitmap); 956 ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
943 957 if (ret) {
958 unlock_page(page);
959 goto err;
960 }
961 }
944 unlock_page(page); 962 unlock_page(page);
945 } 963 }
946 } 964 }
947 if (page == NULL || !PageUptodate(page)) 965 if (page == NULL || !PageUptodate(page)) {
966 ret = -EIO;
948 goto err; 967 goto err;
968 }
949 e4b->bd_buddy_page = page; 969 e4b->bd_buddy_page = page;
950 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); 970 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
951 mark_page_accessed(page); 971 mark_page_accessed(page);
@@ -962,7 +982,7 @@ err:
962 page_cache_release(e4b->bd_buddy_page); 982 page_cache_release(e4b->bd_buddy_page);
963 e4b->bd_buddy = NULL; 983 e4b->bd_buddy = NULL;
964 e4b->bd_bitmap = NULL; 984 e4b->bd_bitmap = NULL;
965 return -EIO; 985 return ret;
966} 986}
967 987
968static void ext4_mb_release_desc(struct ext4_buddy *e4b) 988static void ext4_mb_release_desc(struct ext4_buddy *e4b)
@@ -1031,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1031 } 1051 }
1032} 1052}
1033 1053
1034static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, 1054static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1035 int first, int count) 1055 int first, int count)
1036{ 1056{
1037 int block = 0; 1057 int block = 0;
@@ -1071,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1071 blocknr += block; 1091 blocknr += block;
1072 blocknr += 1092 blocknr +=
1073 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 1093 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1074 1094 ext4_unlock_group(sb, e4b->bd_group);
1075 ext4_error(sb, __func__, "double-free of inode" 1095 ext4_error(sb, __func__, "double-free of inode"
1076 " %lu's block %llu(bit %u in group %lu)\n", 1096 " %lu's block %llu(bit %u in group %lu)\n",
1077 inode ? inode->i_ino : 0, blocknr, block, 1097 inode ? inode->i_ino : 0, blocknr, block,
1078 e4b->bd_group); 1098 e4b->bd_group);
1099 ext4_lock_group(sb, e4b->bd_group);
1079 } 1100 }
1080 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1101 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1081 e4b->bd_info->bb_counters[order]++; 1102 e4b->bd_info->bb_counters[order]++;
@@ -1113,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1113 } while (1); 1134 } while (1);
1114 } 1135 }
1115 mb_check_buddy(e4b); 1136 mb_check_buddy(e4b);
1116
1117 return 0;
1118} 1137}
1119 1138
1120static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, 1139static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
@@ -1730,10 +1749,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1730 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 1749 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
1731 spin_unlock(&sbi->s_md_lock); 1750 spin_unlock(&sbi->s_md_lock);
1732 } 1751 }
1733
1734 /* searching for the right group start from the goal value specified */
1735 group = ac->ac_g_ex.fe_group;
1736
 1737 /* Let's just scan groups to find more or less suitable blocks */ 1752
1738 cr = ac->ac_2order ? 0 : 1; 1753 cr = ac->ac_2order ? 0 : 1;
1739 /* 1754 /*
@@ -1743,6 +1758,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1743repeat: 1758repeat:
1744 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { 1759 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
1745 ac->ac_criteria = cr; 1760 ac->ac_criteria = cr;
1761 /*
1762 * searching for the right group start
1763 * from the goal value specified
1764 */
1765 group = ac->ac_g_ex.fe_group;
1766
1746 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { 1767 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
1747 struct ext4_group_info *grp; 1768 struct ext4_group_info *grp;
1748 struct ext4_group_desc *desc; 1769 struct ext4_group_desc *desc;
@@ -1963,6 +1984,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
1963 int rc; 1984 int rc;
1964 int size; 1985 int size;
1965 1986
1987 if (unlikely(sbi->s_mb_history == NULL))
1988 return -ENOMEM;
1966 s = kmalloc(sizeof(*s), GFP_KERNEL); 1989 s = kmalloc(sizeof(*s), GFP_KERNEL);
1967 if (s == NULL) 1990 if (s == NULL)
1968 return -ENOMEM; 1991 return -ENOMEM;
@@ -2165,9 +2188,7 @@ static void ext4_mb_history_init(struct super_block *sb)
2165 sbi->s_mb_history_cur = 0; 2188 sbi->s_mb_history_cur = 0;
2166 spin_lock_init(&sbi->s_mb_history_lock); 2189 spin_lock_init(&sbi->s_mb_history_lock);
2167 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); 2190 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2168 sbi->s_mb_history = kmalloc(i, GFP_KERNEL); 2191 sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
2169 if (likely(sbi->s_mb_history != NULL))
2170 memset(sbi->s_mb_history, 0, i);
 2171 /* if we can't allocate history, then we simply won't use it */ 2192
2172} 2193}
2173 2194
@@ -2215,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
2215#define ext4_mb_history_init(sb) 2236#define ext4_mb_history_init(sb)
2216#endif 2237#endif
2217 2238
2239
2240/* Create and initialize ext4_group_info data for the given group. */
2241int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2242 struct ext4_group_desc *desc)
2243{
2244 int i, len;
2245 int metalen = 0;
2246 struct ext4_sb_info *sbi = EXT4_SB(sb);
2247 struct ext4_group_info **meta_group_info;
2248
2249 /*
 2250 * First check if this group is the first group of a descriptor
 2251 * block. If so, we have to allocate a new table of pointers
 2252 * to ext4_group_info structures.
2253 */
2254 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2255 metalen = sizeof(*meta_group_info) <<
2256 EXT4_DESC_PER_BLOCK_BITS(sb);
2257 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2258 if (meta_group_info == NULL) {
2259 printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2260 "buddy group\n");
2261 goto exit_meta_group_info;
2262 }
2263 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
2264 meta_group_info;
2265 }
2266
2267 /*
 2268 * Calculate the needed size. If the size of bb_counters changes,
 2269 * don't forget about ext4_mb_generate_buddy().
2270 */
2271 len = offsetof(typeof(**meta_group_info),
2272 bb_counters[sb->s_blocksize_bits + 2]);
2273
2274 meta_group_info =
2275 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2276 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2277
2278 meta_group_info[i] = kzalloc(len, GFP_KERNEL);
2279 if (meta_group_info[i] == NULL) {
2280 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2281 goto exit_group_info;
2282 }
2283 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2284 &(meta_group_info[i]->bb_state));
2285
2286 /*
2287 * initialize bb_free to be able to skip
2288 * empty groups without initialization
2289 */
2290 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2291 meta_group_info[i]->bb_free =
2292 ext4_free_blocks_after_init(sb, group, desc);
2293 } else {
2294 meta_group_info[i]->bb_free =
2295 le16_to_cpu(desc->bg_free_blocks_count);
2296 }
2297
2298 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2299
2300#ifdef DOUBLE_CHECK
2301 {
2302 struct buffer_head *bh;
2303 meta_group_info[i]->bb_bitmap =
2304 kmalloc(sb->s_blocksize, GFP_KERNEL);
2305 BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2306 bh = ext4_read_block_bitmap(sb, group);
2307 BUG_ON(bh == NULL);
2308 memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2309 sb->s_blocksize);
2310 put_bh(bh);
2311 }
2312#endif
2313
2314 return 0;
2315
2316exit_group_info:
2317 /* If a meta_group_info table has been allocated, release it now */
2318 if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
2319 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
2320exit_meta_group_info:
2321 return -ENOMEM;
2322} /* ext4_mb_add_groupinfo */
2323
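
The two-level s_group_info layout that ext4_mb_add_groupinfo() fills splits a group number into a pointer-block index and an offset within that block. A lookup helper in the same spirit (a sketch; the hypothetical name sketch_get_group_info stands in for the tree's real accessor):

static inline struct ext4_group_info *
sketch_get_group_info(struct super_block *sb, ext4_group_t group)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	/* first level: one pointer block per EXT4_DESC_PER_BLOCK groups */
	struct ext4_group_info **blk =
		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];

	/* second level: position of the group inside that block */
	return blk[group & (EXT4_DESC_PER_BLOCK(sb) - 1)];
}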
2324/*
2325 * Add a group to the existing groups.
2326 * This function is used for online resize
2327 */
2328int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
2329 struct ext4_group_desc *desc)
2330{
2331 struct ext4_sb_info *sbi = EXT4_SB(sb);
2332 struct inode *inode = sbi->s_buddy_cache;
2333 int blocks_per_page;
2334 int block;
2335 int pnum;
2336 struct page *page;
2337 int err;
2338
 2339 /* Add the group based on its group descriptor */
2340 err = ext4_mb_add_groupinfo(sb, group, desc);
2341 if (err)
2342 return err;
2343
2344 /*
 2345 * Cache pages containing dynamic mb_alloc data (buddy and bitmap
 2346 * data) are marked not up to date so that they will be re-initialized
 2347 * during the next call to ext4_mb_load_buddy.
2348 */
2349
2350 /* Set buddy page as not up to date */
2351 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2352 block = group * 2;
2353 pnum = block / blocks_per_page;
2354 page = find_get_page(inode->i_mapping, pnum);
2355 if (page != NULL) {
2356 ClearPageUptodate(page);
2357 page_cache_release(page);
2358 }
2359
2360 /* Set bitmap page as not up to date */
2361 block++;
2362 pnum = block / blocks_per_page;
2363 page = find_get_page(inode->i_mapping, pnum);
2364 if (page != NULL) {
2365 ClearPageUptodate(page);
2366 page_cache_release(page);
2367 }
2368
2369 return 0;
2370}
2371
2372/*
2373 * Update an existing group.
2374 * This function is used for online resize
2375 */
2376void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
2377{
2378 grp->bb_free += add;
2379}
2380
2218static int ext4_mb_init_backend(struct super_block *sb) 2381static int ext4_mb_init_backend(struct super_block *sb)
2219{ 2382{
2220 ext4_group_t i; 2383 ext4_group_t i;
2221 int j, len, metalen; 2384 int metalen;
2222 struct ext4_sb_info *sbi = EXT4_SB(sb); 2385 struct ext4_sb_info *sbi = EXT4_SB(sb);
2223 int num_meta_group_infos = 2386 struct ext4_super_block *es = sbi->s_es;
2224 (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >> 2387 int num_meta_group_infos;
2225 EXT4_DESC_PER_BLOCK_BITS(sb); 2388 int num_meta_group_infos_max;
2389 int array_size;
2226 struct ext4_group_info **meta_group_info; 2390 struct ext4_group_info **meta_group_info;
2391 struct ext4_group_desc *desc;
2392
2393 /* This is the number of blocks used by GDT */
2394 num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
2395 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
2396
2397 /*
2398 * This is the total number of blocks used by GDT including
2399 * the number of reserved blocks for GDT.
 2400 * The s_group_info array is allocated with this value
 2401 * to allow a clean online resize without complex
 2402 * pointer manipulation.
 2403 * The drawback is unused memory when no resize
 2404 * occurs, but it is very low in terms of pages
 2405 * (see comments below).
2406 * Need to handle this properly when META_BG resizing is allowed
2407 */
2408 num_meta_group_infos_max = num_meta_group_infos +
2409 le16_to_cpu(es->s_reserved_gdt_blocks);
2227 2410
2411 /*
 2412 * array_size is the size of the s_group_info array. We round it
 2413 * up to the next power of two because kmalloc internally rounds
 2414 * allocations that way, so we get the extra memory for free here
 2415 * (e.g. it may be used for a META_BG resize).
2416 */
2417 array_size = 1;
2418 while (array_size < sizeof(*sbi->s_group_info) *
2419 num_meta_group_infos_max)
2420 array_size = array_size << 1;
2228 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2421 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2229 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2422 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2230 * So a two level scheme suffices for now. */ 2423 * So a two level scheme suffices for now. */
2231 sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * 2424 sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
2232 num_meta_group_infos, GFP_KERNEL);
2233 if (sbi->s_group_info == NULL) { 2425 if (sbi->s_group_info == NULL) {
2234 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2426 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
2235 return -ENOMEM; 2427 return -ENOMEM;
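
The while loop computing array_size above is an open-coded power-of-two round-up; with linux/log2.h in scope it could be written as the one-liner below (a sketch, not what the patch does). For example, 8-byte pointers and num_meta_group_infos_max = 100 give 800 bytes, rounded up to 1024:

	array_size = roundup_pow_of_two(sizeof(*sbi->s_group_info) *
					num_meta_group_infos_max);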
@@ -2256,63 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
2256 sbi->s_group_info[i] = meta_group_info; 2448 sbi->s_group_info[i] = meta_group_info;
2257 } 2449 }
2258 2450
2259 /*
2260 * calculate needed size. if change bb_counters size,
2261 * don't forget about ext4_mb_generate_buddy()
2262 */
2263 len = sizeof(struct ext4_group_info);
2264 len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
2265 for (i = 0; i < sbi->s_groups_count; i++) { 2451 for (i = 0; i < sbi->s_groups_count; i++) {
2266 struct ext4_group_desc *desc;
2267
2268 meta_group_info =
2269 sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2270 j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
2271
2272 meta_group_info[j] = kzalloc(len, GFP_KERNEL);
2273 if (meta_group_info[j] == NULL) {
2274 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2275 goto err_freebuddy;
2276 }
2277 desc = ext4_get_group_desc(sb, i, NULL); 2452 desc = ext4_get_group_desc(sb, i, NULL);
2278 if (desc == NULL) { 2453 if (desc == NULL) {
2279 printk(KERN_ERR 2454 printk(KERN_ERR
2280 "EXT4-fs: can't read descriptor %lu\n", i); 2455 "EXT4-fs: can't read descriptor %lu\n", i);
2281 i++;
2282 goto err_freebuddy; 2456 goto err_freebuddy;
2283 } 2457 }
2284 memset(meta_group_info[j], 0, len); 2458 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
2285 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2459 goto err_freebuddy;
2286 &(meta_group_info[j]->bb_state));
2287
2288 /*
2289 * initialize bb_free to be able to skip
2290 * empty groups without initialization
2291 */
2292 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2293 meta_group_info[j]->bb_free =
2294 ext4_free_blocks_after_init(sb, i, desc);
2295 } else {
2296 meta_group_info[j]->bb_free =
2297 le16_to_cpu(desc->bg_free_blocks_count);
2298 }
2299
2300 INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
2301
2302#ifdef DOUBLE_CHECK
2303 {
2304 struct buffer_head *bh;
2305 meta_group_info[j]->bb_bitmap =
2306 kmalloc(sb->s_blocksize, GFP_KERNEL);
2307 BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
2308 bh = read_block_bitmap(sb, i);
2309 BUG_ON(bh == NULL);
2310 memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
2311 sb->s_blocksize);
2312 put_bh(bh);
2313 }
2314#endif
2315
2316 } 2460 }
2317 2461
2318 return 0; 2462 return 0;
@@ -2336,6 +2480,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2336 unsigned i; 2480 unsigned i;
2337 unsigned offset; 2481 unsigned offset;
2338 unsigned max; 2482 unsigned max;
2483 int ret;
2339 2484
2340 if (!test_opt(sb, MBALLOC)) 2485 if (!test_opt(sb, MBALLOC))
2341 return 0; 2486 return 0;
@@ -2370,12 +2515,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2370 } while (i <= sb->s_blocksize_bits + 1); 2515 } while (i <= sb->s_blocksize_bits + 1);
2371 2516
2372 /* init file for buddy data */ 2517 /* init file for buddy data */
2373 i = ext4_mb_init_backend(sb); 2518 ret = ext4_mb_init_backend(sb);
2374 if (i) { 2519 if (ret != 0) {
2375 clear_opt(sbi->s_mount_opt, MBALLOC); 2520 clear_opt(sbi->s_mount_opt, MBALLOC);
2376 kfree(sbi->s_mb_offsets); 2521 kfree(sbi->s_mb_offsets);
2377 kfree(sbi->s_mb_maxs); 2522 kfree(sbi->s_mb_maxs);
2378 return i; 2523 return ret;
2379 } 2524 }
2380 2525
2381 spin_lock_init(&sbi->s_md_lock); 2526 spin_lock_init(&sbi->s_md_lock);
@@ -2548,8 +2693,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
2548 ext4_lock_group(sb, md->group); 2693 ext4_lock_group(sb, md->group);
2549 for (i = 0; i < md->num; i++) { 2694 for (i = 0; i < md->num; i++) {
2550 mb_debug(" %u", md->blocks[i]); 2695 mb_debug(" %u", md->blocks[i]);
2551 err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1); 2696 mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
2552 BUG_ON(err != 0);
2553 } 2697 }
2554 mb_debug("\n"); 2698 mb_debug("\n");
2555 ext4_unlock_group(sb, md->group); 2699 ext4_unlock_group(sb, md->group);
@@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
2575 2719
2576 2720
2577 2721
2578#define MB_PROC_VALUE_READ(name) \ 2722#define MB_PROC_FOPS(name) \
2579static int ext4_mb_read_##name(char *page, char **start, \ 2723static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
2580 off_t off, int count, int *eof, void *data) \
2581{ \ 2724{ \
2582 struct ext4_sb_info *sbi = data; \ 2725 struct ext4_sb_info *sbi = m->private; \
2583 int len; \ 2726 \
2584 *eof = 1; \ 2727 seq_printf(m, "%ld\n", sbi->s_mb_##name); \
2585 if (off != 0) \ 2728 return 0; \
2586 return 0; \ 2729} \
2587 len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ 2730 \
2588 *start = page; \ 2731static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
2589 return len; \ 2732{ \
2590} 2733 return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
2591 2734} \
2592#define MB_PROC_VALUE_WRITE(name) \ 2735 \
2593static int ext4_mb_write_##name(struct file *file, \ 2736static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
2594 const char __user *buf, unsigned long cnt, void *data) \ 2737 const char __user *buf, size_t cnt, loff_t *ppos) \
2595{ \ 2738{ \
2596 struct ext4_sb_info *sbi = data; \ 2739 struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
2597 char str[32]; \ 2740 char str[32]; \
2598 long value; \ 2741 long value; \
2599 if (cnt >= sizeof(str)) \ 2742 if (cnt >= sizeof(str)) \
@@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct file *file, \
2605 return -ERANGE; \ 2748 return -ERANGE; \
2606 sbi->s_mb_##name = value; \ 2749 sbi->s_mb_##name = value; \
2607 return cnt; \ 2750 return cnt; \
2608} 2751} \
2752 \
2753static const struct file_operations ext4_mb_##name##_proc_fops = { \
2754 .owner = THIS_MODULE, \
2755 .open = ext4_mb_##name##_proc_open, \
2756 .read = seq_read, \
2757 .llseek = seq_lseek, \
2758 .release = single_release, \
2759 .write = ext4_mb_##name##_proc_write, \
2760};
2609 2761
2610MB_PROC_VALUE_READ(stats); 2762MB_PROC_FOPS(stats);
2611MB_PROC_VALUE_WRITE(stats); 2763MB_PROC_FOPS(max_to_scan);
2612MB_PROC_VALUE_READ(max_to_scan); 2764MB_PROC_FOPS(min_to_scan);
2613MB_PROC_VALUE_WRITE(max_to_scan); 2765MB_PROC_FOPS(order2_reqs);
2614MB_PROC_VALUE_READ(min_to_scan); 2766MB_PROC_FOPS(stream_request);
2615MB_PROC_VALUE_WRITE(min_to_scan); 2767MB_PROC_FOPS(group_prealloc);
2616MB_PROC_VALUE_READ(order2_reqs);
2617MB_PROC_VALUE_WRITE(order2_reqs);
2618MB_PROC_VALUE_READ(stream_request);
2619MB_PROC_VALUE_WRITE(stream_request);
2620MB_PROC_VALUE_READ(group_prealloc);
2621MB_PROC_VALUE_WRITE(group_prealloc);
2622 2768
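
Each MB_PROC_FOPS(name) instantiation above expands to a seq_file-backed trio plus a file_operations table; abbreviated, MB_PROC_FOPS(stats) yields:

static int ext4_mb_stats_proc_show(struct seq_file *m, void *v)
{
	struct ext4_sb_info *sbi = m->private;

	seq_printf(m, "%ld\n", sbi->s_mb_stats);
	return 0;
}

static const struct file_operations ext4_mb_stats_proc_fops = {
	.owner   = THIS_MODULE,
	.open    = ext4_mb_stats_proc_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
	.write   = ext4_mb_stats_proc_write,
};

which proc_create_data() then binds to the per-device directory via MB_PROC_HANDLER below.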
2623#define MB_PROC_HANDLER(name, var) \ 2769#define MB_PROC_HANDLER(name, var) \
2624do { \ 2770do { \
2625 proc = create_proc_entry(name, mode, sbi->s_mb_proc); \ 2771 proc = proc_create_data(name, mode, sbi->s_mb_proc, \
2772 &ext4_mb_##var##_proc_fops, sbi); \
2626 if (proc == NULL) { \ 2773 if (proc == NULL) { \
 2627 printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \ 2774
2628 goto err_out; \ 2775 goto err_out; \
2629 } \ 2776 } \
2630 proc->data = sbi; \
2631 proc->read_proc = ext4_mb_read_##var ; \
2632 proc->write_proc = ext4_mb_write_##var; \
2633} while (0) 2777} while (0)
2634 2778
2635static int ext4_mb_init_per_dev_proc(struct super_block *sb) 2779static int ext4_mb_init_per_dev_proc(struct super_block *sb)
@@ -2639,6 +2783,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2639 struct proc_dir_entry *proc; 2783 struct proc_dir_entry *proc;
2640 char devname[64]; 2784 char devname[64];
2641 2785
2786 if (proc_root_ext4 == NULL) {
2787 sbi->s_mb_proc = NULL;
2788 return -EINVAL;
2789 }
2642 bdevname(sb->s_bdev, devname); 2790 bdevname(sb->s_bdev, devname);
2643 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); 2791 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
2644 2792
@@ -2747,7 +2895,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2747 2895
2748 2896
2749 err = -EIO; 2897 err = -EIO;
2750 bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); 2898 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
2751 if (!bitmap_bh) 2899 if (!bitmap_bh)
2752 goto out_err; 2900 goto out_err;
2753 2901
@@ -2816,7 +2964,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2816 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); 2964 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
2817 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 2965 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2818 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 2966 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2819 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 2967
2968 /*
2969 * free blocks account has already be reduced/reserved
2970 * at write_begin() time for delayed allocation
2971 * do not double accounting
2972 */
2973 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2974 percpu_counter_sub(&sbi->s_freeblocks_counter,
2975 ac->ac_b_ex.fe_len);
2976
2977 if (sbi->s_log_groups_per_flex) {
2978 ext4_group_t flex_group = ext4_flex_group(sbi,
2979 ac->ac_b_ex.fe_group);
2980 spin_lock(sb_bgl_lock(sbi, flex_group));
2981 sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
2982 spin_unlock(sb_bgl_lock(sbi, flex_group));
2983 }
2820 2984
2821 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 2985 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
2822 if (err) 2986 if (err)
@@ -3473,8 +3637,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3473 if (bit >= end) 3637 if (bit >= end)
3474 break; 3638 break;
3475 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3639 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3476 if (next > end)
3477 next = end;
3478 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3640 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
3479 le32_to_cpu(sbi->s_es->s_first_data_block); 3641 le32_to_cpu(sbi->s_es->s_first_data_block);
3480 mb_debug(" free preallocated %u/%u in group %u\n", 3642 mb_debug(" free preallocated %u/%u in group %u\n",
@@ -3569,7 +3731,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3569 if (list_empty(&grp->bb_prealloc_list)) 3731 if (list_empty(&grp->bb_prealloc_list))
3570 return 0; 3732 return 0;
3571 3733
3572 bitmap_bh = read_block_bitmap(sb, group); 3734 bitmap_bh = ext4_read_block_bitmap(sb, group);
3573 if (bitmap_bh == NULL) { 3735 if (bitmap_bh == NULL) {
3574 /* error handling here */ 3736 /* error handling here */
3575 ext4_mb_release_desc(&e4b); 3737 ext4_mb_release_desc(&e4b);
@@ -3743,7 +3905,7 @@ repeat:
3743 err = ext4_mb_load_buddy(sb, group, &e4b); 3905 err = ext4_mb_load_buddy(sb, group, &e4b);
3744 BUG_ON(err != 0); /* error handling here */ 3906 BUG_ON(err != 0); /* error handling here */
3745 3907
3746 bitmap_bh = read_block_bitmap(sb, group); 3908 bitmap_bh = ext4_read_block_bitmap(sb, group);
3747 if (bitmap_bh == NULL) { 3909 if (bitmap_bh == NULL) {
3748 /* error handling here */ 3910 /* error handling here */
3749 ext4_mb_release_desc(&e4b); 3911 ext4_mb_release_desc(&e4b);
@@ -4011,10 +4173,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4011 sbi = EXT4_SB(sb); 4173 sbi = EXT4_SB(sb);
4012 4174
4013 if (!test_opt(sb, MBALLOC)) { 4175 if (!test_opt(sb, MBALLOC)) {
4014 block = ext4_new_blocks_old(handle, ar->inode, ar->goal, 4176 block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
4015 &(ar->len), errp); 4177 &(ar->len), errp);
4016 return block; 4178 return block;
4017 } 4179 }
4180 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
4181 /*
4182 * With delalloc we already reserved the blocks
4183 */
4184 ar->len = ext4_has_free_blocks(sbi, ar->len);
4185 }
4186
4187 if (ar->len == 0) {
4188 *errp = -ENOSPC;
4189 return 0;
4190 }
4018 4191
4019 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { 4192 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
4020 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4193 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
@@ -4026,10 +4199,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4026 } 4199 }
4027 inquota = ar->len; 4200 inquota = ar->len;
4028 4201
4202 if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
4203 ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4204
4029 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4205 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4030 if (!ac) { 4206 if (!ac) {
4207 ar->len = 0;
4031 *errp = -ENOMEM; 4208 *errp = -ENOMEM;
4032 return 0; 4209 goto out1;
4033 } 4210 }
4034 4211
4035 ext4_mb_poll_new_transaction(sb, handle); 4212 ext4_mb_poll_new_transaction(sb, handle);
@@ -4037,12 +4214,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4037 *errp = ext4_mb_initialize_context(ac, ar); 4214 *errp = ext4_mb_initialize_context(ac, ar);
4038 if (*errp) { 4215 if (*errp) {
4039 ar->len = 0; 4216 ar->len = 0;
4040 goto out; 4217 goto out2;
4041 } 4218 }
4042 4219
4043 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 4220 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
4044 if (!ext4_mb_use_preallocated(ac)) { 4221 if (!ext4_mb_use_preallocated(ac)) {
4045
4046 ac->ac_op = EXT4_MB_HISTORY_ALLOC; 4222 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
4047 ext4_mb_normalize_request(ac, ar); 4223 ext4_mb_normalize_request(ac, ar);
4048repeat: 4224repeat:
@@ -4085,11 +4261,12 @@ repeat:
4085 4261
4086 ext4_mb_release_context(ac); 4262 ext4_mb_release_context(ac);
4087 4263
4088out: 4264out2:
4265 kmem_cache_free(ext4_ac_cachep, ac);
4266out1:
4089 if (ar->len < inquota) 4267 if (ar->len < inquota)
4090 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); 4268 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
4091 4269
4092 kmem_cache_free(ext4_ac_cachep, ac);
4093 return block; 4270 return block;
4094} 4271}
4095static void ext4_mb_poll_new_transaction(struct super_block *sb, 4272static void ext4_mb_poll_new_transaction(struct super_block *sb,
@@ -4242,7 +4419,7 @@ do_more:
4242 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); 4419 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
4243 count -= overflow; 4420 count -= overflow;
4244 } 4421 }
4245 bitmap_bh = read_block_bitmap(sb, block_group); 4422 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4246 if (!bitmap_bh) 4423 if (!bitmap_bh)
4247 goto error_return; 4424 goto error_return;
4248 gdp = ext4_get_group_desc(sb, block_group, &gd_bh); 4425 gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4309,10 +4486,9 @@ do_more:
4309 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); 4486 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
4310 } else { 4487 } else {
4311 ext4_lock_group(sb, block_group); 4488 ext4_lock_group(sb, block_group);
4312 err = mb_free_blocks(inode, &e4b, bit, count); 4489 mb_free_blocks(inode, &e4b, bit, count);
4313 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4490 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4314 ext4_unlock_group(sb, block_group); 4491 ext4_unlock_group(sb, block_group);
4315 BUG_ON(err != 0);
4316 } 4492 }
4317 4493
4318 spin_lock(sb_bgl_lock(sbi, block_group)); 4494 spin_lock(sb_bgl_lock(sbi, block_group));
@@ -4321,6 +4497,13 @@ do_more:
4321 spin_unlock(sb_bgl_lock(sbi, block_group)); 4497 spin_unlock(sb_bgl_lock(sbi, block_group));
4322 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4498 percpu_counter_add(&sbi->s_freeblocks_counter, count);
4323 4499
4500 if (sbi->s_log_groups_per_flex) {
4501 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4502 spin_lock(sb_bgl_lock(sbi, flex_group));
4503 sbi->s_flex_groups[flex_group].free_blocks += count;
4504 spin_unlock(sb_bgl_lock(sbi, flex_group));
4505 }
4506
4324 ext4_mb_release_desc(&e4b); 4507 ext4_mb_release_desc(&e4b);
4325 4508
4326 *freed += count; 4509 *freed += count;
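
Both s_flex_groups hunks above follow one pattern: map the block group to its flex group, take the corresponding bgl lock, and adjust the flex-level free_blocks count by the same delta applied to the group descriptor. Assuming ext4_flex_group() is the usual shift by s_log_groups_per_flex (as introduced elsewhere in this series), the mapping is:

static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
					   ext4_group_t block_group)
{
	/* 2^s_log_groups_per_flex block groups share one flex group */
	return block_group >> sbi->s_log_groups_per_flex;
}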
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ab16beaa830d..387ad98350c3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
183 struct inode *inode); 183 struct inode *inode);
184 184
185/* 185/*
186 * p is at least 6 bytes before the end of page
187 */
188static inline struct ext4_dir_entry_2 *
189ext4_next_entry(struct ext4_dir_entry_2 *p)
190{
191 return (struct ext4_dir_entry_2 *)((char *)p +
192 ext4_rec_len_from_disk(p->rec_len));
193}
194
195/*
186 * Future: use high four bits of block for coalesce-on-delete flags 196 * Future: use high four bits of block for coalesce-on-delete flags
187 * Mask them off for now. 197 * Mask them off for now.
188 */ 198 */
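
Hoisting ext4_next_entry() above its first user is what enables the rewritten lookup loop later in this file. The walk it supports always has the same shape (a sketch of the pattern, not code added by this patch):

	struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) bh->b_data;
	struct ext4_dir_entry_2 *top = (struct ext4_dir_entry_2 *)
		((char *) de + sb->s_blocksize - EXT4_DIR_REC_LEN(0));

	for (; de < top; de = ext4_next_entry(de)) {
		/* validate de (rec_len sanity) before matching names */
	}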
@@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
231{ 241{
232 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - 242 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
233 EXT4_DIR_REC_LEN(2) - infosize; 243 EXT4_DIR_REC_LEN(2) - infosize;
234 return 0? 20: entry_space / sizeof(struct dx_entry); 244 return entry_space / sizeof(struct dx_entry);
235} 245}
236 246
237static inline unsigned dx_node_limit (struct inode *dir) 247static inline unsigned dx_node_limit (struct inode *dir)
238{ 248{
239 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); 249 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
240 return 0? 22: entry_space / sizeof(struct dx_entry); 250 return entry_space / sizeof(struct dx_entry);
241} 251}
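
With a 4 KiB block these limits work out to the familiar htree fan-outs: dx_root_limit() gives (4096 - 12 - 12 - infosize) / 8, i.e. 508 entries for an 8-byte dx_root_info, and dx_node_limit() gives (4096 - 8) / 8 = 511 entries per interior node, each dx_entry being 8 bytes. The deleted "0? 20:" and "0? 22:" forms were debugging toggles for forcing tiny fan-outs.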
242 252
243/* 253/*
@@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
554 564
555 565
556/* 566/*
557 * p is at least 6 bytes before the end of page
558 */
559static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
560{
561 return (struct ext4_dir_entry_2 *)((char *)p +
562 ext4_rec_len_from_disk(p->rec_len));
563}
564
565/*
566 * This function fills a red-black tree with information from a 567 * This function fills a red-black tree with information from a
 567 * directory block. It returns the number of directory entries loaded 568
568 * into the tree. If there is an error it is returned in err. 569 * into the tree. If there is an error it is returned in err.
@@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
993 de = (struct ext4_dir_entry_2 *) bh->b_data; 994 de = (struct ext4_dir_entry_2 *) bh->b_data;
994 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - 995 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
995 EXT4_DIR_REC_LEN(0)); 996 EXT4_DIR_REC_LEN(0));
996 for (; de < top; de = ext4_next_entry(de)) 997 for (; de < top; de = ext4_next_entry(de)) {
997 if (ext4_match (namelen, name, de)) { 998 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
998 if (!ext4_check_dir_entry("ext4_find_entry", 999 + ((char *) de - bh->b_data);
999 dir, de, bh, 1000
1000 (block<<EXT4_BLOCK_SIZE_BITS(sb)) 1001 if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
1001 +((char *)de - bh->b_data))) { 1002 brelse(bh);
1002 brelse (bh);
1003 *err = ERR_BAD_DX_DIR; 1003 *err = ERR_BAD_DX_DIR;
1004 goto errout; 1004 goto errout;
1005 } 1005 }
1006 *res_dir = de; 1006
1007 dx_release (frames); 1007 if (ext4_match(namelen, name, de)) {
1008 return bh; 1008 *res_dir = de;
1009 dx_release(frames);
1010 return bh;
1011 }
1009 } 1012 }
1010 brelse (bh); 1013 brelse (bh);
1011 /* Check to see if we should continue to search */ 1014 /* Check to see if we should continue to search */
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 9ff7b1c04239..f000fbe2cd93 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -866,6 +866,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
866 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); 866 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
867 867
868 /* 868 /*
869 * We can allocate memory for mb_alloc based on the new group
870 * descriptor
871 */
872 if (test_opt(sb, MBALLOC)) {
873 err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
874 if (err)
875 goto exit_journal;
876 }
877 /*
869 * Make the new blocks and inodes valid next. We do this before 878 * Make the new blocks and inodes valid next. We do this before
870 * increasing the group count so that once the group is enabled, 879 * increasing the group count so that once the group is enabled,
871 * all of its blocks and inodes are already valid. 880 * all of its blocks and inodes are already valid.
@@ -957,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
957 handle_t *handle; 966 handle_t *handle;
958 int err; 967 int err;
959 unsigned long freed_blocks; 968 unsigned long freed_blocks;
969 ext4_group_t group;
970 struct ext4_group_info *grp;
960 971
961 /* We don't need to worry about locking wrt other resizers just 972 /* We don't need to worry about locking wrt other resizers just
962 * yet: we're going to revalidate es->s_blocks_count after 973 * yet: we're going to revalidate es->s_blocks_count after
@@ -988,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
988 } 999 }
989 1000
990 /* Handle the remaining blocks in the last group only. */ 1001 /* Handle the remaining blocks in the last group only. */
991 ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last); 1002 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
992 1003
993 if (last == 0) { 1004 if (last == 0) {
994 ext4_warning(sb, __func__, 1005 ext4_warning(sb, __func__,
@@ -1060,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1060 o_blocks_count + add); 1071 o_blocks_count + add);
1061 if ((err = ext4_journal_stop(handle))) 1072 if ((err = ext4_journal_stop(handle)))
1062 goto exit_put; 1073 goto exit_put;
1074
1075 /*
1076 * Mark mballoc pages as not up to date so that they will be updated
1077 * next time they are loaded by ext4_mb_load_buddy.
1078 */
1079 if (test_opt(sb, MBALLOC)) {
1080 struct ext4_sb_info *sbi = EXT4_SB(sb);
1081 struct inode *inode = sbi->s_buddy_cache;
1082 int blocks_per_page;
1083 int block;
1084 int pnum;
1085 struct page *page;
1086
1087 /* Set buddy page as not up to date */
1088 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1089 block = group * 2;
1090 pnum = block / blocks_per_page;
1091 page = find_get_page(inode->i_mapping, pnum);
1092 if (page != NULL) {
1093 ClearPageUptodate(page);
1094 page_cache_release(page);
1095 }
1096
1097 /* Set bitmap page as not up to date */
1098 block++;
1099 pnum = block / blocks_per_page;
1100 page = find_get_page(inode->i_mapping, pnum);
1101 if (page != NULL) {
1102 ClearPageUptodate(page);
1103 page_cache_release(page);
1104 }
1105
1106 /* Get the info on the last group */
1107 grp = ext4_get_group_info(sb, group);
1108
1109 /* Update free blocks in group info */
1110 ext4_mb_update_group_info(grp, add);
1111 }
1112
1063 if (test_opt(sb, DEBUG)) 1113 if (test_opt(sb, DEBUG))
1064 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", 1114 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
1065 ext4_blocks_count(es)); 1115 ext4_blocks_count(es));
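[Editor's note: the page numbers above follow the buddy-cache layout as the hunk's own comments label it: each group occupies two blocks of s_buddy_cache, buddy data at block group*2 and the block bitmap at group*2 + 1, packed blocks_per_page to a page. A runnable sketch of that arithmetic, with example sizes.]

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096, blocksize = 1024;
	unsigned long blocks_per_page = page_size / blocksize;
	unsigned long group = 13;	/* example: the group being grown */

	unsigned long buddy_pnum  = (group * 2) / blocks_per_page;
	unsigned long bitmap_pnum = (group * 2 + 1) / blocks_per_page;

	/* These are the two pages the hunk looks up with find_get_page()
	 * and marks !uptodate so ext4_mb_load_buddy() re-reads them. */
	printf("buddy page %lu, bitmap page %lu\n", buddy_pnum, bitmap_pnum);
	return 0;
}

With a 1 KiB block size both blocks of a group usually share one page, so the two lookups often clear the same page twice, which is harmless.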
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cb96f127c366..1cb371dcd609 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb)
506 ext4_ext_release(sb); 506 ext4_ext_release(sb);
507 ext4_xattr_put_super(sb); 507 ext4_xattr_put_super(sb);
508 jbd2_journal_destroy(sbi->s_journal); 508 jbd2_journal_destroy(sbi->s_journal);
509 sbi->s_journal = NULL;
509 if (!(sb->s_flags & MS_RDONLY)) { 510 if (!(sb->s_flags & MS_RDONLY)) {
510 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 511 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
511 es->s_state = cpu_to_le16(sbi->s_mount_state); 512 es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -517,6 +518,7 @@ static void ext4_put_super (struct super_block * sb)
517 for (i = 0; i < sbi->s_gdb_count; i++) 518 for (i = 0; i < sbi->s_gdb_count; i++)
518 brelse(sbi->s_group_desc[i]); 519 brelse(sbi->s_group_desc[i]);
519 kfree(sbi->s_group_desc); 520 kfree(sbi->s_group_desc);
521 kfree(sbi->s_flex_groups);
520 percpu_counter_destroy(&sbi->s_freeblocks_counter); 522 percpu_counter_destroy(&sbi->s_freeblocks_counter);
521 percpu_counter_destroy(&sbi->s_freeinodes_counter); 523 percpu_counter_destroy(&sbi->s_freeinodes_counter);
522 percpu_counter_destroy(&sbi->s_dirs_counter); 524 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
571 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 573 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
572 INIT_LIST_HEAD(&ei->i_prealloc_list); 574 INIT_LIST_HEAD(&ei->i_prealloc_list);
573 spin_lock_init(&ei->i_prealloc_lock); 575 spin_lock_init(&ei->i_prealloc_lock);
576 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
577 ei->i_reserved_data_blocks = 0;
578 ei->i_reserved_meta_blocks = 0;
579 ei->i_allocated_meta_blocks = 0;
580 ei->i_delalloc_reserved_flag = 0;
581 spin_lock_init(&(ei->i_block_reservation_lock));
574 return &ei->vfs_inode; 582 return &ei->vfs_inode;
575} 583}
576 584
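[Editor's note: the alloc_inode hunk seeds the per-inode delayed-allocation accounting used elsewhere in this series: reserved data blocks, reserved and allocated metadata blocks, guarded by i_block_reservation_lock. The field names below follow the patch; the transitions are an illustration of how such counters roughly move, not kernel code.]

#include <stdio.h>

struct resv {
	unsigned reserved_data_blocks;	/* blocks promised at write time */
	unsigned reserved_meta_blocks;	/* worst-case metadata estimate */
	unsigned allocated_meta_blocks;	/* metadata actually allocated */
};

int main(void)
{
	struct resv r = {0, 0, 0};

	/* buffered write of 8 blocks: reserve data plus estimated metadata */
	r.reserved_data_blocks += 8;
	r.reserved_meta_blocks += 1;

	/* writeback allocates for real: reservations drain into allocations */
	r.reserved_data_blocks -= 8;
	r.allocated_meta_blocks += 1;
	r.reserved_meta_blocks -= 1;

	printf("data-resv=%u meta-resv=%u meta-alloc=%u\n",
	       r.reserved_data_blocks, r.reserved_meta_blocks,
	       r.allocated_meta_blocks);
	return 0;
}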
@@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inode *inode)
635 EXT4_I(inode)->i_block_alloc_info = NULL; 643 EXT4_I(inode)->i_block_alloc_info = NULL;
636 if (unlikely(rsv)) 644 if (unlikely(rsv))
637 kfree(rsv); 645 kfree(rsv);
646 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
647 &EXT4_I(inode)->jinode);
638} 648}
639 649
640static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) 650static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
671 unsigned long def_mount_opts; 681 unsigned long def_mount_opts;
672 struct super_block *sb = vfs->mnt_sb; 682 struct super_block *sb = vfs->mnt_sb;
673 struct ext4_sb_info *sbi = EXT4_SB(sb); 683 struct ext4_sb_info *sbi = EXT4_SB(sb);
674 journal_t *journal = sbi->s_journal;
675 struct ext4_super_block *es = sbi->s_es; 684 struct ext4_super_block *es = sbi->s_es;
676 685
677 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 686 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
747 seq_puts(seq, ",nomballoc"); 756 seq_puts(seq, ",nomballoc");
748 if (test_opt(sb, I_VERSION)) 757 if (test_opt(sb, I_VERSION))
749 seq_puts(seq, ",i_version"); 758 seq_puts(seq, ",i_version");
759 if (!test_opt(sb, DELALLOC))
760 seq_puts(seq, ",nodelalloc");
761
750 762
751 if (sbi->s_stripe) 763 if (sbi->s_stripe)
752 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 764 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
@@ -894,7 +906,7 @@ enum {
894 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 906 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
895 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 907 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
896 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, 908 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
897 Opt_mballoc, Opt_nomballoc, Opt_stripe, 909 Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
898}; 910};
899 911
900static match_table_t tokens = { 912static match_table_t tokens = {
@@ -953,6 +965,8 @@ static match_table_t tokens = {
953 {Opt_nomballoc, "nomballoc"}, 965 {Opt_nomballoc, "nomballoc"},
954 {Opt_stripe, "stripe=%u"}, 966 {Opt_stripe, "stripe=%u"},
955 {Opt_resize, "resize"}, 967 {Opt_resize, "resize"},
968 {Opt_delalloc, "delalloc"},
969 {Opt_nodelalloc, "nodelalloc"},
956 {Opt_err, NULL}, 970 {Opt_err, NULL},
957}; 971};
958 972
@@ -990,6 +1004,7 @@ static int parse_options (char *options, struct super_block *sb,
990 int qtype, qfmt; 1004 int qtype, qfmt;
991 char *qname; 1005 char *qname;
992#endif 1006#endif
1007 ext4_fsblk_t last_block;
993 1008
994 if (!options) 1009 if (!options)
995 return 1; 1010 return 1;
@@ -1309,15 +1324,39 @@ set_qf_format:
1309 clear_opt(sbi->s_mount_opt, NOBH); 1324 clear_opt(sbi->s_mount_opt, NOBH);
1310 break; 1325 break;
1311 case Opt_extents: 1326 case Opt_extents:
1327 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
1328 EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1329 ext4_warning(sb, __func__,
1330 "extents feature not enabled "
1331 "on this filesystem, use tune2fs\n");
1332 return 0;
1333 }
1312 set_opt (sbi->s_mount_opt, EXTENTS); 1334 set_opt (sbi->s_mount_opt, EXTENTS);
1313 break; 1335 break;
1314 case Opt_noextents: 1336 case Opt_noextents:
1337 /*
1338 * When e2fsprogs support resizing an already existing
1339 * ext3 file system to greater than 2**32 we need to
1340 * add support to block allocator to handle growing
1341 * already existing block mapped inode so that blocks
1342 * allocated for them fall within 2**32
1343 */
1344 last_block = ext4_blocks_count(sbi->s_es) - 1;
1345 if (last_block > 0xffffffffULL) {
1346 printk(KERN_ERR "EXT4-fs: Filesystem too "
1347 "large to mount with "
1348 "-o noextents options\n");
1349 return 0;
1350 }
1315 clear_opt (sbi->s_mount_opt, EXTENTS); 1351 clear_opt (sbi->s_mount_opt, EXTENTS);
1316 break; 1352 break;
1317 case Opt_i_version: 1353 case Opt_i_version:
1318 set_opt(sbi->s_mount_opt, I_VERSION); 1354 set_opt(sbi->s_mount_opt, I_VERSION);
1319 sb->s_flags |= MS_I_VERSION; 1355 sb->s_flags |= MS_I_VERSION;
1320 break; 1356 break;
1357 case Opt_nodelalloc:
1358 clear_opt(sbi->s_mount_opt, DELALLOC);
1359 break;
1321 case Opt_mballoc: 1360 case Opt_mballoc:
1322 set_opt(sbi->s_mount_opt, MBALLOC); 1361 set_opt(sbi->s_mount_opt, MBALLOC);
1323 break; 1362 break;
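[Editor's note: the last_block guard in the Opt_noextents case exists because block-mapped (non-extent) inodes address blocks with 32-bit numbers; once the filesystem contains blocks past 2^32 - 1, a noextents mount could hand out blocks an indirect-mapped inode cannot reference. The arithmetic, runnable:]

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t blocks_count = 1ULL << 33;	/* 32 TiB with 4 KiB blocks */
	uint64_t last_block = blocks_count - 1;

	if (last_block > 0xffffffffULL)
		printf("refuse -o noextents: last block %llu needs > 32 bits\n",
		       (unsigned long long)last_block);
	return 0;
}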
@@ -1331,6 +1370,9 @@ set_qf_format:
1331 return 0; 1370 return 0;
1332 sbi->s_stripe = option; 1371 sbi->s_stripe = option;
1333 break; 1372 break;
1373 case Opt_delalloc:
1374 set_opt(sbi->s_mount_opt, DELALLOC);
1375 break;
1334 default: 1376 default:
1335 printk (KERN_ERR 1377 printk (KERN_ERR
1336 "EXT4-fs: Unrecognized mount option \"%s\" " 1378 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1443,6 +1485,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1443 return res; 1485 return res;
1444} 1486}
1445 1487
1488static int ext4_fill_flex_info(struct super_block *sb)
1489{
1490 struct ext4_sb_info *sbi = EXT4_SB(sb);
1491 struct ext4_group_desc *gdp = NULL;
1492 struct buffer_head *bh;
1493 ext4_group_t flex_group_count;
1494 ext4_group_t flex_group;
1495 int groups_per_flex = 0;
1496 __u64 block_bitmap = 0;
1497 int i;
1498
1499 if (!sbi->s_es->s_log_groups_per_flex) {
1500 sbi->s_log_groups_per_flex = 0;
1501 return 1;
1502 }
1503
1504 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1505 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1506
1507 flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
1508 groups_per_flex;
1509 sbi->s_flex_groups = kmalloc(flex_group_count *
1510 sizeof(struct flex_groups), GFP_KERNEL);
1511 if (sbi->s_flex_groups == NULL) {
1512 printk(KERN_ERR "EXT4-fs: not enough memory\n");
1513 goto failed;
1514 }
1515 memset(sbi->s_flex_groups, 0, flex_group_count *
1516 sizeof(struct flex_groups));
1517
1518 gdp = ext4_get_group_desc(sb, 1, &bh);
1519 block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
1520
1521 for (i = 0; i < sbi->s_groups_count; i++) {
1522 gdp = ext4_get_group_desc(sb, i, &bh);
1523
1524 flex_group = ext4_flex_group(sbi, i);
1525 sbi->s_flex_groups[flex_group].free_inodes +=
1526 le16_to_cpu(gdp->bg_free_inodes_count);
1527 sbi->s_flex_groups[flex_group].free_blocks +=
1528 le16_to_cpu(gdp->bg_free_blocks_count);
1529 }
1530
1531 return 1;
1532failed:
1533 return 0;
1534}
1535
1446__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, 1536__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
1447 struct ext4_group_desc *gdp) 1537 struct ext4_group_desc *gdp)
1448{ 1538{
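[Editor's note: ext4_fill_flex_info() folds every 2^s_log_groups_per_flex block groups into one flex group and accumulates their free-inode and free-block counts; the ext4_flex_group() mapping it relies on is, in effect, a right shift of the group number. A standalone sketch of that fold; the struct and helper are simplified stand-ins for the patch's flex_groups and ext4_flex_group().]

#include <stdio.h>

struct flex_groups { unsigned free_inodes, free_blocks; };

static unsigned flex_group(unsigned group, unsigned log_groups_per_flex)
{
	return group >> log_groups_per_flex;
}

int main(void)
{
	unsigned log = 4;			/* 16 groups per flex group */
	unsigned groups_count = 40;
	unsigned flex_count = (groups_count + (1u << log) - 1) >> log;
	struct flex_groups fg[8] = {{0}};

	for (unsigned g = 0; g < groups_count; g++) {
		unsigned f = flex_group(g, log);
		fg[f].free_inodes += 100;	/* per-group free counts */
		fg[f].free_blocks += 1000;	/* (constants for brevity) */
	}
	printf("%u flex groups; fg[0]: %u inodes / %u blocks free\n",
	       flex_count, fg[0].free_inodes, fg[0].free_blocks);
	return 0;
}

As a side note, the kmalloc()+memset() pair in the new function is the open-coded form of kzalloc().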
@@ -1810,8 +1900,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
1810} 1900}
1811 1901
1812static int ext4_fill_super (struct super_block *sb, void *data, int silent) 1902static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1813 __releases(kernel_sem) 1903 __releases(kernel_lock)
1814 __acquires(kernel_sem) 1904 __acquires(kernel_lock)
1815 1905
1816{ 1906{
1817 struct buffer_head * bh; 1907 struct buffer_head * bh;
@@ -1851,11 +1941,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1851 goto out_fail; 1941 goto out_fail;
1852 } 1942 }
1853 1943
1854 if (!sb_set_blocksize(sb, blocksize)) {
1855 printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
1856 goto out_fail;
1857 }
1858
1859 /* 1944 /*
1860 * The ext4 superblock will not be buffer aligned for other than 1kB 1945 * The ext4 superblock will not be buffer aligned for other than 1kB
1861 * block sizes. We need to calculate the offset from buffer start. 1946 * block sizes. We need to calculate the offset from buffer start.
@@ -1919,15 +2004,28 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1919 2004
1920 /* 2005 /*
1921 * turn on extents feature by default in ext4 filesystem 2006 * turn on extents feature by default in ext4 filesystem
1922 * User -o noextents to turn it off 2007 * only if feature flag already set by mkfs or tune2fs.
2008 * Use -o noextents to turn it off
1923 */ 2009 */
1924 set_opt(sbi->s_mount_opt, EXTENTS); 2010 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2011 set_opt(sbi->s_mount_opt, EXTENTS);
2012 else
2013 ext4_warning(sb, __func__,
2014 "extents feature not enabled on this filesystem, "
2015 "use tune2fs.\n");
1925 /* 2016 /*
1926 * turn on mballoc feature by default in ext4 filesystem 2017 * turn on mballoc code by default in ext4 filesystem
1927 * User -o nomballoc to turn it off 2018 * Use -o nomballoc to turn it off
1928 */ 2019 */
1929 set_opt(sbi->s_mount_opt, MBALLOC); 2020 set_opt(sbi->s_mount_opt, MBALLOC);
1930 2021
2022 /*
2023 * enable delayed allocation by default
2024 * Use -o nodelalloc to turn it off
2025 */
2026 set_opt(sbi->s_mount_opt, DELALLOC);
2027
2028
1931 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, 2029 if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
1932 NULL, 0)) 2030 NULL, 0))
1933 goto failed_mount; 2031 goto failed_mount;
@@ -2138,6 +2236,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
2138 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); 2236 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
2139 goto failed_mount2; 2237 goto failed_mount2;
2140 } 2238 }
2239 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
2240 if (!ext4_fill_flex_info(sb)) {
2241 printk(KERN_ERR
2242 "EXT4-fs: unable to initialize "
2243 "flex_bg meta info!\n");
2244 goto failed_mount2;
2245 }
2246
2141 sbi->s_gdb_count = db_count; 2247 sbi->s_gdb_count = db_count;
2142 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 2248 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2143 spin_lock_init(&sbi->s_next_gen_lock); 2249 spin_lock_init(&sbi->s_next_gen_lock);
@@ -2358,6 +2464,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
2358 test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": 2464 test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
2359 "writeback"); 2465 "writeback");
2360 2466
2467 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
2468 printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
2469 "requested data journaling mode\n");
2470 clear_opt(sbi->s_mount_opt, DELALLOC);
2471 } else if (test_opt(sb, DELALLOC))
2472 printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
2473
2361 ext4_ext_init(sb); 2474 ext4_ext_init(sb);
2362 ext4_mb_init(sb, needs_recovery); 2475 ext4_mb_init(sb, needs_recovery);
2363 2476
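[Editor's note: the hunk above resolves a mount-option conflict with a fixed precedence: data=journal wins and delalloc is dropped with a warning. A standalone sketch of that resolution, with a plain bitmask standing in for sbi->s_mount_opt.]

#include <stdio.h>

#define OPT_DELALLOC     0x1u
#define OPT_JOURNAL_DATA 0x2u

int main(void)
{
	unsigned mount_opt = OPT_DELALLOC | OPT_JOURNAL_DATA;

	if (mount_opt & OPT_JOURNAL_DATA) {
		printf("Ignoring delalloc option - requested data journaling mode\n");
		mount_opt &= ~OPT_DELALLOC;	/* journaling takes precedence */
	} else if (mount_opt & OPT_DELALLOC) {
		printf("delayed allocation enabled\n");
	}
	return 0;
}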
@@ -2372,6 +2485,7 @@ cantfind_ext4:
2372 2485
2373failed_mount4: 2486failed_mount4:
2374 jbd2_journal_destroy(sbi->s_journal); 2487 jbd2_journal_destroy(sbi->s_journal);
2488 sbi->s_journal = NULL;
2375failed_mount3: 2489failed_mount3:
2376 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2490 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2377 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2491 percpu_counter_destroy(&sbi->s_freeinodes_counter);
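[Editor's note: both jbd2_journal_destroy() sites in this patch, here in failed_mount4 and earlier in ext4_put_super(), now clear sbi->s_journal immediately afterwards, so any later path that inspects the pointer sees NULL rather than freed memory. A generic, runnable sketch of the pattern; the names are hypothetical.]

#include <stdio.h>
#include <stdlib.h>

struct journal { int state; };

static void teardown(struct journal **jp)
{
	if (*jp) {
		free(*jp);
		*jp = NULL;	/* later cleanup sees NULL, not a stale pointer */
	}
}

int main(void)
{
	struct journal *j = calloc(1, sizeof(*j));

	teardown(&j);	/* failed_mount4-style cleanup */
	teardown(&j);	/* a second teardown is now a harmless no-op */
	printf("journal: %p\n", (void *)j);
	return 0;
}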
@@ -3325,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3325 err = ext4_journal_dirty_metadata(handle, bh); 3439 err = ext4_journal_dirty_metadata(handle, bh);
3326 else { 3440 else {
3327 /* Always do at least ordered writes for quotas */ 3441 /* Always do at least ordered writes for quotas */
3328 err = ext4_journal_dirty_data(handle, bh); 3442 err = ext4_jbd2_file_inode(handle, inode);
3329 mark_buffer_dirty(bh); 3443 mark_buffer_dirty(bh);
3330 } 3444 }
3331 brelse(bh); 3445 brelse(bh);
@@ -3337,8 +3451,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3337 blk++; 3451 blk++;
3338 } 3452 }
3339out: 3453out:
3340 if (len == towrite) 3454 if (len == towrite) {
3455 mutex_unlock(&inode->i_mutex);
3341 return err; 3456 return err;
3457 }
3342 if (inode->i_size < off+len-towrite) { 3458 if (inode->i_size < off+len-towrite) {
3343 i_size_write(inode, off+len-towrite); 3459 i_size_write(inode, off+len-towrite);
3344 EXT4_I(inode)->i_disksize = inode->i_size; 3460 EXT4_I(inode)->i_disksize = inode->i_size;
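[Editor's note: the second quota_write hunk closes an unlock gap: the early return for "nothing written" (len == towrite) previously skipped the mutex_unlock taken at function entry. A reduced, runnable sketch of the fixed shape, with a pthread mutex standing in for i_mutex and the journalled copy loop elided.]

#include <pthread.h>
#include <stdio.h>
#include <sys/types.h>

static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;

static ssize_t quota_write(const char *data, size_t len)
{
	size_t towrite = len;
	ssize_t err = 0;

	(void)data;
	pthread_mutex_lock(&i_mutex);
	/* ... journalled copy loop that decrements towrite on success ... */
	if (len == towrite) {			/* wrote nothing */
		pthread_mutex_unlock(&i_mutex);	/* the fix: don't leak the lock */
		return err;
	}
	/* ... i_size and timestamp bookkeeping ... */
	pthread_mutex_unlock(&i_mutex);
	return len - towrite;
}

int main(void)
{
	printf("wrote %zd bytes\n", quota_write("x", 1));
	return 0;
}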
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index ff08633f398e..93c5fdcdad2e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,7 +810,7 @@ inserted:
810 /* We need to allocate a new block */ 810 /* We need to allocate a new block */
811 ext4_fsblk_t goal = ext4_group_first_block_no(sb, 811 ext4_fsblk_t goal = ext4_group_first_block_no(sb,
812 EXT4_I(inode)->i_block_group); 812 EXT4_I(inode)->i_block_group);
813 ext4_fsblk_t block = ext4_new_block(handle, inode, 813 ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
814 goal, &error); 814 goal, &error);
815 if (error) 815 if (error)
816 goto cleanup; 816 goto cleanup;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index fff33382cadc..ac1a52cf2a37 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -13,13 +13,11 @@
13#include "ext4.h" 13#include "ext4.h"
14#include "xattr.h" 14#include "xattr.h"
15 15
16#define XATTR_TRUSTED_PREFIX "trusted."
17
18static size_t 16static size_t
19ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, 17ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
20 const char *name, size_t name_len) 18 const char *name, size_t name_len)
21{ 19{
22 const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; 20 const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
23 const size_t total_len = prefix_len + name_len + 1; 21 const size_t total_len = prefix_len + name_len + 1;
24 22
25 if (!capable(CAP_SYS_ADMIN)) 23 if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 67be723fcc4e..d91aa61b42aa 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -12,13 +12,11 @@
12#include "ext4.h" 12#include "ext4.h"
13#include "xattr.h" 13#include "xattr.h"
14 14
15#define XATTR_USER_PREFIX "user."
16
17static size_t 15static size_t
18ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, 16ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
19 const char *name, size_t name_len) 17 const char *name, size_t name_len)
20{ 18{
21 const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; 19 const size_t prefix_len = XATTR_USER_PREFIX_LEN;
22 const size_t total_len = prefix_len + name_len + 1; 20 const size_t total_len = prefix_len + name_len + 1;
23 21
24 if (!test_opt(inode->i_sb, XATTR_USER)) 22 if (!test_opt(inode->i_sb, XATTR_USER))
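[Editor's note: both listers now pull XATTR_TRUSTED_PREFIX_LEN and XATTR_USER_PREFIX_LEN from the shared xattr header instead of redefining the prefix strings per file. A runnable sketch of the length arithmetic they share (prefix + name + trailing NUL); the macro values below mirror the removed local definitions.]

#include <stdio.h>
#include <string.h>

#define XATTR_USER_PREFIX     "user."
#define XATTR_USER_PREFIX_LEN (sizeof(XATTR_USER_PREFIX) - 1)

int main(void)
{
	const char *name = "comment";
	size_t name_len = strlen(name);
	size_t total_len = XATTR_USER_PREFIX_LEN + name_len + 1;
	char buf[64];

	/* same concatenation the list callbacks perform into the
	 * caller-supplied buffer when list_size is large enough */
	memcpy(buf, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
	memcpy(buf + XATTR_USER_PREFIX_LEN, name, name_len + 1);
	printf("%s (%zu bytes incl. NUL)\n", buf, total_len);
	return 0;
}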