diff options
author | Theodore Ts'o <tytso@mit.edu> | 2009-12-30 14:20:45 -0500 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2009-12-30 14:20:45 -0500 |
commit | 0637c6f4135f592f094207c7c21e7c0fc5557834 (patch) | |
tree | ee76fc861dffa902e80d0d11c681f5dfa2fcc569 /fs | |
parent | 515f41c33a9d44a964264c9511ad2c869af1fac3 (diff) |
ext4: Patch up how we claim metadata blocks for quota purposes
As reported in Kernel Bugzilla #14936, commit d21cd8f triggered a BUG
in the function ext4_da_update_reserve_space() found in
fs/ext4/inode.c. The root cause of this BUG() was the fact
that ext4_calc_metadata_amount() can severely over-estimate how many
metadata blocks will be needed, especially when using direct
block-mapped files.
In addition, it can also badly *under* estimate how much space is
needed, since ext4_calc_metadata_amount() assumes that the blocks are
contiguous, and this is not always true. If the application is
writing blocks to a sparse file, the number of metadata blocks
necessary can be severely underestimated by the functions
ext4_da_reserve_space(), ext4_da_update_reserve_space() and
ext4_da_release_space(). This was the cause of the dq_claim_space
reports found on kerneloops.org.
Unfortunately, doing this right means that we need to massively
over-estimate the amount of free space needed. So in some cases we
may need to force the inode to be written to disk asynchronously in
order to avoid spurious quota failures.
http://bugzilla.kernel.org/show_bug.cgi?id=14936
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ext4/inode.c | 157 |
1 files changed, 84 insertions, 73 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3e3b45458ef8..84eeb8f515a3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -1043,43 +1043,47 @@ static int ext4_calc_metadata_amount(struct inode *inode, int blocks) | |||
1043 | return ext4_indirect_calc_metadata_amount(inode, blocks); | 1043 | return ext4_indirect_calc_metadata_amount(inode, blocks); |
1044 | } | 1044 | } |
1045 | 1045 | ||
1046 | /* | ||
1047 | * Called with i_data_sem down, which is important since we can call | ||
1048 | * ext4_discard_preallocations() from here. | ||
1049 | */ | ||
1046 | static void ext4_da_update_reserve_space(struct inode *inode, int used) | 1050 | static void ext4_da_update_reserve_space(struct inode *inode, int used) |
1047 | { | 1051 | { |
1048 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1052 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1049 | int total, mdb, mdb_free, mdb_claim = 0; | 1053 | struct ext4_inode_info *ei = EXT4_I(inode); |
1050 | 1054 | int mdb_free = 0; | |
1051 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | 1055 | |
1052 | /* recalculate the number of metablocks still need to be reserved */ | 1056 | spin_lock(&ei->i_block_reservation_lock); |
1053 | total = EXT4_I(inode)->i_reserved_data_blocks - used; | 1057 | if (unlikely(used > ei->i_reserved_data_blocks)) { |
1054 | mdb = ext4_calc_metadata_amount(inode, total); | 1058 | ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " |
1055 | 1059 | "with only %d reserved data blocks\n", | |
1056 | /* figure out how many metablocks to release */ | 1060 | __func__, inode->i_ino, used, |
1057 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | 1061 | ei->i_reserved_data_blocks); |
1058 | mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; | 1062 | WARN_ON(1); |
1059 | 1063 | used = ei->i_reserved_data_blocks; | |
1060 | if (mdb_free) { | 1064 | } |
1061 | /* Account for allocated meta_blocks */ | 1065 | |
1062 | mdb_claim = EXT4_I(inode)->i_allocated_meta_blocks; | 1066 | /* Update per-inode reservations */ |
1063 | BUG_ON(mdb_free < mdb_claim); | 1067 | ei->i_reserved_data_blocks -= used; |
1064 | mdb_free -= mdb_claim; | 1068 | used += ei->i_allocated_meta_blocks; |
1065 | 1069 | ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; | |
1066 | /* update fs dirty blocks counter */ | 1070 | ei->i_allocated_meta_blocks = 0; |
1071 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, used); | ||
1072 | |||
1073 | if (ei->i_reserved_data_blocks == 0) { | ||
1074 | /* | ||
1075 | * We can release all of the reserved metadata blocks | ||
1076 | * only when we have written all of the delayed | ||
1077 | * allocation blocks. | ||
1078 | */ | ||
1079 | mdb_free = ei->i_allocated_meta_blocks; | ||
1067 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); | 1080 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); |
1068 | EXT4_I(inode)->i_allocated_meta_blocks = 0; | 1081 | ei->i_allocated_meta_blocks = 0; |
1069 | EXT4_I(inode)->i_reserved_meta_blocks = mdb; | ||
1070 | } | 1082 | } |
1071 | |||
1072 | /* update per-inode reservations */ | ||
1073 | BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); | ||
1074 | EXT4_I(inode)->i_reserved_data_blocks -= used; | ||
1075 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, used + mdb_claim); | ||
1076 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1083 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
1077 | 1084 | ||
1078 | vfs_dq_claim_block(inode, used + mdb_claim); | 1085 | /* Update quota subsystem */ |
1079 | 1086 | vfs_dq_claim_block(inode, used); | |
1080 | /* | ||
1081 | * free those over-booking quota for metadata blocks | ||
1082 | */ | ||
1083 | if (mdb_free) | 1087 | if (mdb_free) |
1084 | vfs_dq_release_reservation_block(inode, mdb_free); | 1088 | vfs_dq_release_reservation_block(inode, mdb_free); |
1085 | 1089 | ||
@@ -1088,7 +1092,8 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) | |||
1088 | * there aren't any writers on the inode, we can discard the | 1092 | * there aren't any writers on the inode, we can discard the |
1089 | * inode's preallocations. | 1093 | * inode's preallocations. |
1090 | */ | 1094 | */ |
1091 | if (!total && (atomic_read(&inode->i_writecount) == 0)) | 1095 | if ((ei->i_reserved_data_blocks == 0) && |
1096 | (atomic_read(&inode->i_writecount) == 0)) | ||
1092 | ext4_discard_preallocations(inode); | 1097 | ext4_discard_preallocations(inode); |
1093 | } | 1098 | } |
1094 | 1099 | ||
@@ -1801,7 +1806,8 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) | |||
1801 | { | 1806 | { |
1802 | int retries = 0; | 1807 | int retries = 0; |
1803 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1808 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1804 | unsigned long md_needed, mdblocks, total = 0; | 1809 | struct ext4_inode_info *ei = EXT4_I(inode); |
1810 | unsigned long md_needed, md_reserved, total = 0; | ||
1805 | 1811 | ||
1806 | /* | 1812 | /* |
1807 | * recalculate the amount of metadata blocks to reserve | 1813 | * recalculate the amount of metadata blocks to reserve |
@@ -1809,35 +1815,44 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) | |||
1809 | * worse case is one extent per block | 1815 | * worse case is one extent per block |
1810 | */ | 1816 | */ |
1811 | repeat: | 1817 | repeat: |
1812 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | 1818 | spin_lock(&ei->i_block_reservation_lock); |
1813 | total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; | 1819 | md_reserved = ei->i_reserved_meta_blocks; |
1814 | mdblocks = ext4_calc_metadata_amount(inode, total); | 1820 | md_needed = ext4_calc_metadata_amount(inode, nrblocks); |
1815 | BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); | ||
1816 | |||
1817 | md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; | ||
1818 | total = md_needed + nrblocks; | 1821 | total = md_needed + nrblocks; |
1819 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1822 | spin_unlock(&ei->i_block_reservation_lock); |
1820 | 1823 | ||
1821 | /* | 1824 | /* |
1822 | * Make quota reservation here to prevent quota overflow | 1825 | * Make quota reservation here to prevent quota overflow |
1823 | * later. Real quota accounting is done at pages writeout | 1826 | * later. Real quota accounting is done at pages writeout |
1824 | * time. | 1827 | * time. |
1825 | */ | 1828 | */ |
1826 | if (vfs_dq_reserve_block(inode, total)) | 1829 | if (vfs_dq_reserve_block(inode, total)) { |
1830 | /* | ||
1831 | * We tend to badly over-estimate the amount of | ||
1832 | * metadata blocks which are needed, so if we have | ||
1833 | * reserved any metadata blocks, try to force out the | ||
1834 | * inode and see if we have any better luck. | ||
1835 | */ | ||
1836 | if (md_reserved && retries++ <= 3) | ||
1837 | goto retry; | ||
1827 | return -EDQUOT; | 1838 | return -EDQUOT; |
1839 | } | ||
1828 | 1840 | ||
1829 | if (ext4_claim_free_blocks(sbi, total)) { | 1841 | if (ext4_claim_free_blocks(sbi, total)) { |
1830 | vfs_dq_release_reservation_block(inode, total); | 1842 | vfs_dq_release_reservation_block(inode, total); |
1831 | if (ext4_should_retry_alloc(inode->i_sb, &retries)) { | 1843 | if (ext4_should_retry_alloc(inode->i_sb, &retries)) { |
1844 | retry: | ||
1845 | if (md_reserved) | ||
1846 | write_inode_now(inode, (retries == 3)); | ||
1832 | yield(); | 1847 | yield(); |
1833 | goto repeat; | 1848 | goto repeat; |
1834 | } | 1849 | } |
1835 | return -ENOSPC; | 1850 | return -ENOSPC; |
1836 | } | 1851 | } |
1837 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | 1852 | spin_lock(&ei->i_block_reservation_lock); |
1838 | EXT4_I(inode)->i_reserved_data_blocks += nrblocks; | 1853 | ei->i_reserved_data_blocks += nrblocks; |
1839 | EXT4_I(inode)->i_reserved_meta_blocks += md_needed; | 1854 | ei->i_reserved_meta_blocks += md_needed; |
1840 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1855 | spin_unlock(&ei->i_block_reservation_lock); |
1841 | 1856 | ||
1842 | return 0; /* success */ | 1857 | return 0; /* success */ |
1843 | } | 1858 | } |
@@ -1845,49 +1860,45 @@ repeat: | |||
1845 | static void ext4_da_release_space(struct inode *inode, int to_free) | 1860 | static void ext4_da_release_space(struct inode *inode, int to_free) |
1846 | { | 1861 | { |
1847 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1862 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1848 | int total, mdb, mdb_free, release; | 1863 | struct ext4_inode_info *ei = EXT4_I(inode); |
1849 | 1864 | ||
1850 | if (!to_free) | 1865 | if (!to_free) |
1851 | return; /* Nothing to release, exit */ | 1866 | return; /* Nothing to release, exit */ |
1852 | 1867 | ||
1853 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | 1868 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); |
1854 | 1869 | ||
1855 | if (!EXT4_I(inode)->i_reserved_data_blocks) { | 1870 | if (unlikely(to_free > ei->i_reserved_data_blocks)) { |
1856 | /* | 1871 | /* |
1857 | * if there is no reserved blocks, but we try to free some | 1872 | * if there aren't enough reserved blocks, then the |
1858 | * then the counter is messed up somewhere. | 1873 | * counter is messed up somewhere. Since this |
1859 | * but since this function is called from invalidate | 1874 | * function is called from invalidate page, it's |
1860 | * page, it's harmless to return without any action | 1875 | * harmless to return without any action. |
1861 | */ | 1876 | */ |
1862 | printk(KERN_INFO "ext4 delalloc try to release %d reserved " | 1877 | ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " |
1863 | "blocks for inode %lu, but there is no reserved " | 1878 | "ino %lu, to_free %d with only %d reserved " |
1864 | "data blocks\n", to_free, inode->i_ino); | 1879 | "data blocks\n", inode->i_ino, to_free, |
1865 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1880 | ei->i_reserved_data_blocks); |
1866 | return; | 1881 | WARN_ON(1); |
1882 | to_free = ei->i_reserved_data_blocks; | ||
1867 | } | 1883 | } |
1884 | ei->i_reserved_data_blocks -= to_free; | ||
1868 | 1885 | ||
1869 | /* recalculate the number of metablocks still need to be reserved */ | 1886 | if (ei->i_reserved_data_blocks == 0) { |
1870 | total = EXT4_I(inode)->i_reserved_data_blocks - to_free; | 1887 | /* |
1871 | mdb = ext4_calc_metadata_amount(inode, total); | 1888 | * We can release all of the reserved metadata blocks |
1872 | 1889 | * only when we have written all of the delayed | |
1873 | /* figure out how many metablocks to release */ | 1890 | * allocation blocks. |
1874 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | 1891 | */ |
1875 | mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; | 1892 | to_free += ei->i_allocated_meta_blocks; |
1876 | 1893 | ei->i_allocated_meta_blocks = 0; | |
1877 | release = to_free + mdb_free; | 1894 | } |
1878 | |||
1879 | /* update fs dirty blocks counter for truncate case */ | ||
1880 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); | ||
1881 | 1895 | ||
1882 | /* update per-inode reservations */ | 1896 | /* update fs dirty blocks counter */ |
1883 | BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); | 1897 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); |
1884 | EXT4_I(inode)->i_reserved_data_blocks -= to_free; | ||
1885 | 1898 | ||
1886 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | ||
1887 | EXT4_I(inode)->i_reserved_meta_blocks = mdb; | ||
1888 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1899 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
1889 | 1900 | ||
1890 | vfs_dq_release_reservation_block(inode, release); | 1901 | vfs_dq_release_reservation_block(inode, to_free); |
1891 | } | 1902 | } |
1892 | 1903 | ||
1893 | static void ext4_da_page_release_reservation(struct page *page, | 1904 | static void ext4_da_page_release_reservation(struct page *page, |