author     Theodore Ts'o <tytso@mit.edu>   2009-12-30 14:20:45 -0500
committer  Theodore Ts'o <tytso@mit.edu>   2009-12-30 14:20:45 -0500
commit     0637c6f4135f592f094207c7c21e7c0fc5557834 (patch)
tree       ee76fc861dffa902e80d0d11c681f5dfa2fcc569 /fs/ext4
parent     515f41c33a9d44a964264c9511ad2c869af1fac3 (diff)
ext4: Patch up how we claim metadata blocks for quota purposes
As reported in Kernel Bugzilla #14936, commit d21cd8f triggered a BUG in the function ext4_da_update_reserve_space() found in fs/ext4/inode.c. The root cause of this BUG() is that ext4_calc_metadata_amount() can severely over-estimate how many metadata blocks will be needed, especially when using direct block-mapped files.

In addition, it can also badly *under*-estimate how much space is needed, since ext4_calc_metadata_amount() assumes that the blocks are contiguous, which is not always true. If the application is writing blocks to a sparse file, the number of metadata blocks necessary can be severely underestimated by the functions ext4_da_reserve_space(), ext4_da_update_reserve_space() and ext4_da_release_space(). This was the cause of the dq_claim_space reports found on kerneloops.org.

Unfortunately, doing this right means that we need to massively over-estimate the amount of free space needed. So in some cases we may need to force the inode to be written to disk asynchronously to avoid spurious quota failures.

http://bugzilla.kernel.org/show_bug.cgi?id=14936

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
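To illustrate the bookkeeping scheme this patch moves to, here is a minimal userspace sketch (hypothetical names and numbers, not the kernel code): metadata is over-reserved up front, the metadata blocks actually allocated are claimed together with the written data blocks, and the remaining over-estimated reservation is released only once every delayed-allocation data block has been written out.

/*
 * Simplified, userspace-only model of the new reservation accounting
 * (hypothetical struct and helper; this is not fs/ext4/inode.c).
 */
#include <assert.h>
#include <stdio.h>

struct fake_inode_resv {
        int reserved_data_blocks;       /* delalloc data blocks not yet written */
        int reserved_meta_blocks;       /* worst-case metadata still reserved   */
        int allocated_meta_blocks;      /* metadata blocks actually allocated   */
};

/* Mirrors the spirit of the new ext4_da_update_reserve_space() logic. */
static void claim_blocks(struct fake_inode_resv *r, int used)
{
        int claim, release = 0;

        if (used > r->reserved_data_blocks)     /* counters out of sync */
                used = r->reserved_data_blocks;

        r->reserved_data_blocks -= used;
        claim = used + r->allocated_meta_blocks;
        r->reserved_meta_blocks -= r->allocated_meta_blocks;
        r->allocated_meta_blocks = 0;

        if (r->reserved_data_blocks == 0) {
                /* All delayed data written: drop the surplus estimate. */
                release = r->reserved_meta_blocks;
                r->reserved_meta_blocks = 0;
        }
        printf("claimed %d block(s), released %d over-reserved block(s)\n",
               claim, release);
}

int main(void)
{
        /* 8 data blocks reserved with a worst-case estimate of 3 metadata. */
        struct fake_inode_resv r = { 8, 3, 0 };

        r.allocated_meta_blocks = 1;    /* one index block really allocated */
        claim_blocks(&r, 8);            /* writeback of all 8 data blocks   */
        assert(r.reserved_meta_blocks == 0);
        return 0;
}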
Diffstat (limited to 'fs/ext4')
-rw-r--r--   fs/ext4/inode.c   157
1 file changed, 84 insertions, 73 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e3b45458ef8..84eeb8f515a3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1043,43 +1043,47 @@ static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
         return ext4_indirect_calc_metadata_amount(inode, blocks);
 }
 
+/*
+ * Called with i_data_sem down, which is important since we can call
+ * ext4_discard_preallocations() from here.
+ */
 static void ext4_da_update_reserve_space(struct inode *inode, int used)
 {
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-        int total, mdb, mdb_free, mdb_claim = 0;
-
-        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-        /* recalculate the number of metablocks still need to be reserved */
-        total = EXT4_I(inode)->i_reserved_data_blocks - used;
-        mdb = ext4_calc_metadata_amount(inode, total);
-
-        /* figure out how many metablocks to release */
-        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-        mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
-        if (mdb_free) {
-                /* Account for allocated meta_blocks */
-                mdb_claim = EXT4_I(inode)->i_allocated_meta_blocks;
-                BUG_ON(mdb_free < mdb_claim);
-                mdb_free -= mdb_claim;
-
-                /* update fs dirty blocks counter */
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        int mdb_free = 0;
+
+        spin_lock(&ei->i_block_reservation_lock);
+        if (unlikely(used > ei->i_reserved_data_blocks)) {
+                ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+                         "with only %d reserved data blocks\n",
+                         __func__, inode->i_ino, used,
+                         ei->i_reserved_data_blocks);
+                WARN_ON(1);
+                used = ei->i_reserved_data_blocks;
+        }
+
+        /* Update per-inode reservations */
+        ei->i_reserved_data_blocks -= used;
+        used += ei->i_allocated_meta_blocks;
+        ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
+        ei->i_allocated_meta_blocks = 0;
+        percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
+
+        if (ei->i_reserved_data_blocks == 0) {
+                /*
+                 * We can release all of the reserved metadata blocks
+                 * only when we have written all of the delayed
+                 * allocation blocks.
+                 */
+                mdb_free = ei->i_allocated_meta_blocks;
                 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
-                EXT4_I(inode)->i_allocated_meta_blocks = 0;
-                EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+                ei->i_allocated_meta_blocks = 0;
         }
-
-        /* update per-inode reservations */
-        BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
-        EXT4_I(inode)->i_reserved_data_blocks -= used;
-        percpu_counter_sub(&sbi->s_dirtyblocks_counter, used + mdb_claim);
         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
-        vfs_dq_claim_block(inode, used + mdb_claim);
-
-        /*
-         * free those over-booking quota for metadata blocks
-         */
+        /* Update quota subsystem */
+        vfs_dq_claim_block(inode, used);
         if (mdb_free)
                 vfs_dq_release_reservation_block(inode, mdb_free);
 
@@ -1088,7 +1092,8 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
          * there aren't any writers on the inode, we can discard the
          * inode's preallocations.
          */
-        if (!total && (atomic_read(&inode->i_writecount) == 0))
+        if ((ei->i_reserved_data_blocks == 0) &&
+            (atomic_read(&inode->i_writecount) == 0))
                 ext4_discard_preallocations(inode);
 }
 
@@ -1801,7 +1806,8 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 {
         int retries = 0;
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-        unsigned long md_needed, mdblocks, total = 0;
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        unsigned long md_needed, md_reserved, total = 0;
 
         /*
          * recalculate the amount of metadata blocks to reserve
@@ -1809,35 +1815,44 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
          * worse case is one extent per block
          */
 repeat:
-        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-        total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
-        mdblocks = ext4_calc_metadata_amount(inode, total);
-        BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
-
-        md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
+        spin_lock(&ei->i_block_reservation_lock);
+        md_reserved = ei->i_reserved_meta_blocks;
+        md_needed = ext4_calc_metadata_amount(inode, nrblocks);
         total = md_needed + nrblocks;
-        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+        spin_unlock(&ei->i_block_reservation_lock);
 
         /*
          * Make quota reservation here to prevent quota overflow
          * later. Real quota accounting is done at pages writeout
          * time.
          */
-        if (vfs_dq_reserve_block(inode, total))
+        if (vfs_dq_reserve_block(inode, total)) {
+                /*
+                 * We tend to badly over-estimate the amount of
+                 * metadata blocks which are needed, so if we have
+                 * reserved any metadata blocks, try to force out the
+                 * inode and see if we have any better luck.
+                 */
+                if (md_reserved && retries++ <= 3)
+                        goto retry;
                 return -EDQUOT;
+        }
 
         if (ext4_claim_free_blocks(sbi, total)) {
                 vfs_dq_release_reservation_block(inode, total);
                 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+        retry:
+                        if (md_reserved)
+                                write_inode_now(inode, (retries == 3));
                         yield();
                         goto repeat;
                 }
                 return -ENOSPC;
         }
-        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-        EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
-        EXT4_I(inode)->i_reserved_meta_blocks += md_needed;
-        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+        spin_lock(&ei->i_block_reservation_lock);
+        ei->i_reserved_data_blocks += nrblocks;
+        ei->i_reserved_meta_blocks += md_needed;
+        spin_unlock(&ei->i_block_reservation_lock);
 
         return 0;       /* success */
 }
@@ -1845,49 +1860,45 @@ repeat:
 static void ext4_da_release_space(struct inode *inode, int to_free)
 {
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-        int total, mdb, mdb_free, release;
+        struct ext4_inode_info *ei = EXT4_I(inode);
 
         if (!to_free)
                 return;         /* Nothing to release, exit */
 
         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 
-        if (!EXT4_I(inode)->i_reserved_data_blocks) {
+        if (unlikely(to_free > ei->i_reserved_data_blocks)) {
                 /*
-                 * if there is no reserved blocks, but we try to free some
-                 * then the counter is messed up somewhere.
-                 * but since this function is called from invalidate
-                 * page, it's harmless to return without any action
+                 * if there aren't enough reserved blocks, then the
+                 * counter is messed up somewhere.  Since this
+                 * function is called from invalidate page, it's
+                 * harmless to return without any action.
                  */
-                printk(KERN_INFO "ext4 delalloc try to release %d reserved "
-                                "blocks for inode %lu, but there is no reserved "
-                                "data blocks\n", to_free, inode->i_ino);
-                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-                return;
+                ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+                         "ino %lu, to_free %d with only %d reserved "
+                         "data blocks\n", inode->i_ino, to_free,
+                         ei->i_reserved_data_blocks);
+                WARN_ON(1);
+                to_free = ei->i_reserved_data_blocks;
         }
+        ei->i_reserved_data_blocks -= to_free;
 
-        /* recalculate the number of metablocks still need to be reserved */
-        total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
-        mdb = ext4_calc_metadata_amount(inode, total);
-
-        /* figure out how many metablocks to release */
-        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-        mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
-        release = to_free + mdb_free;
-
-        /* update fs dirty blocks counter for truncate case */
-        percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
+        if (ei->i_reserved_data_blocks == 0) {
+                /*
+                 * We can release all of the reserved metadata blocks
+                 * only when we have written all of the delayed
+                 * allocation blocks.
+                 */
+                to_free += ei->i_allocated_meta_blocks;
+                ei->i_allocated_meta_blocks = 0;
+        }
 
-        /* update per-inode reservations */
-        BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
-        EXT4_I(inode)->i_reserved_data_blocks -= to_free;
+        /* update fs dirty blocks counter */
+        percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
 
-        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-        EXT4_I(inode)->i_reserved_meta_blocks = mdb;
         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
-        vfs_dq_release_reservation_block(inode, release);
+        vfs_dq_release_reservation_block(inode, to_free);
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
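
The retry path added to ext4_da_reserve_space() above can also be sketched in plain userspace C (hypothetical helpers standing in for vfs_dq_reserve_block() and write_inode_now(); a simplified model, not the kernel API): when the quota reservation fails but some metadata is still over-reserved, flush the inode to give back the surplus and retry a bounded number of times, flushing synchronously on the final attempt.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int quota_left = 4;              /* pretend per-user quota, in blocks   */
static int over_reserved = 2;           /* surplus a flush would hand back     */

/* Hypothetical stand-in for vfs_dq_reserve_block(): 0 on success. */
static int try_reserve_quota(int blocks)
{
        if (blocks > quota_left)
                return -1;
        quota_left -= blocks;
        return 0;
}

/* Hypothetical stand-in for write_inode_now(): releases the surplus. */
static void flush_inode(bool sync)
{
        printf("flushing inode (%s)\n", sync ? "sync" : "async");
        quota_left += over_reserved;
        over_reserved = 0;
}

static int reserve_with_retry(int blocks)
{
        int retries = 0;

        while (try_reserve_quota(blocks) != 0) {
                if (!over_reserved || retries++ > 3)
                        return -EDQUOT;
                /* The final attempt flushes synchronously (retries == 3). */
                flush_inode(retries == 3);
        }
        return 0;
}

int main(void)
{
        printf("reserve 6 blocks: %s\n",
               reserve_with_retry(6) ? "EDQUOT" : "ok");
        return 0;
}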