diff options
author | Eric Sandeen <sandeen@redhat.com> | 2010-05-16 11:00:00 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2010-05-16 11:00:00 -0400 |
commit | 72b8ab9dde211ea518ff27e631b2046ef90c29a2 (patch) | |
tree | f63fc1894bcc39c81060e2fd8b21bedd07a1e958 /fs/ext4/inode.c | |
parent | 0e05842bc117ea70ceb979cca798fd026879951b (diff) |
ext4: don't use quota reservation for speculative metadata
Because we can badly over-reserve metadata when we
calculate the worst case, quota handling gets complicated:
we must reserve and then claim later, retry on EDQUOT, etc.
Quota is also a generally smaller pool than fs free blocks,
so this over-reservation hurts more, and more often.
I'm of the opinion that it's not the worst thing to allow
metadata to push a user slightly over quota. This simplifies
the code and avoids the false quota rejections that result
from worst-case speculation.
This patch stops the speculative quota-charging for
worst-case metadata requirements, and just charges quota
when the blocks are allocated at writeout. This also lets
us remove the try-again loop on EDQUOT.
This patch has been tested indirectly by running the xfstests
suite with a hack to mount & enable quota prior to the test.
I also did a more specific test of fragmenting freespace
and then doing a large delalloc write under quota; quota
stopped me at the right amount of file IO, and then the
writeout generated enough metadata (due to the fragmentation)
that it put me slightly over quota, as expected.
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r-- | fs/ext4/inode.c | 69 |
1 file changed, 23 insertions, 46 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 89a31e8869cc..df43217e4e72 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -1076,7 +1076,6 @@ void ext4_da_update_reserve_space(struct inode *inode, | |||
1076 | { | 1076 | { |
1077 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1077 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1078 | struct ext4_inode_info *ei = EXT4_I(inode); | 1078 | struct ext4_inode_info *ei = EXT4_I(inode); |
1079 | int mdb_free = 0, allocated_meta_blocks = 0; | ||
1080 | 1079 | ||
1081 | spin_lock(&ei->i_block_reservation_lock); | 1080 | spin_lock(&ei->i_block_reservation_lock); |
1082 | trace_ext4_da_update_reserve_space(inode, used); | 1081 | trace_ext4_da_update_reserve_space(inode, used); |
@@ -1091,11 +1090,10 @@ void ext4_da_update_reserve_space(struct inode *inode, | |||
1091 | 1090 | ||
1092 | /* Update per-inode reservations */ | 1091 | /* Update per-inode reservations */ |
1093 | ei->i_reserved_data_blocks -= used; | 1092 | ei->i_reserved_data_blocks -= used; |
1094 | used += ei->i_allocated_meta_blocks; | ||
1095 | ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; | 1093 | ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; |
1096 | allocated_meta_blocks = ei->i_allocated_meta_blocks; | 1094 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, |
1095 | used + ei->i_allocated_meta_blocks); | ||
1097 | ei->i_allocated_meta_blocks = 0; | 1096 | ei->i_allocated_meta_blocks = 0; |
1098 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, used); | ||
1099 | 1097 | ||
1100 | if (ei->i_reserved_data_blocks == 0) { | 1098 | if (ei->i_reserved_data_blocks == 0) { |
1101 | /* | 1099 | /* |
@@ -1103,31 +1101,23 @@ void ext4_da_update_reserve_space(struct inode *inode, | |||
1103 | * only when we have written all of the delayed | 1101 | * only when we have written all of the delayed |
1104 | * allocation blocks. | 1102 | * allocation blocks. |
1105 | */ | 1103 | */ |
1106 | mdb_free = ei->i_reserved_meta_blocks; | 1104 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, |
1105 | ei->i_reserved_meta_blocks); | ||
1107 | ei->i_reserved_meta_blocks = 0; | 1106 | ei->i_reserved_meta_blocks = 0; |
1108 | ei->i_da_metadata_calc_len = 0; | 1107 | ei->i_da_metadata_calc_len = 0; |
1109 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); | ||
1110 | } | 1108 | } |
1111 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1109 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
1112 | 1110 | ||
1113 | /* Update quota subsystem */ | 1111 | /* Update quota subsystem for data blocks */ |
1114 | if (quota_claim) { | 1112 | if (quota_claim) |
1115 | dquot_claim_block(inode, used); | 1113 | dquot_claim_block(inode, used); |
1116 | if (mdb_free) | 1114 | else { |
1117 | dquot_release_reservation_block(inode, mdb_free); | ||
1118 | } else { | ||
1119 | /* | 1115 | /* |
1120 | * We did fallocate with an offset that is already delayed | 1116 | * We did fallocate with an offset that is already delayed |
1121 | * allocated. So on delayed allocated writeback we should | 1117 | * allocated. So on delayed allocated writeback we should |
1122 | * not update the quota for allocated blocks. But then | 1118 | * not re-claim the quota for fallocated blocks. |
1123 | * converting an fallocate region to initialized region would | ||
1124 | * have caused a metadata allocation. So claim quota for | ||
1125 | * that | ||
1126 | */ | 1119 | */ |
1127 | if (allocated_meta_blocks) | 1120 | dquot_release_reservation_block(inode, used); |
1128 | dquot_claim_block(inode, allocated_meta_blocks); | ||
1129 | dquot_release_reservation_block(inode, mdb_free + used - | ||
1130 | allocated_meta_blocks); | ||
1131 | } | 1121 | } |
1132 | 1122 | ||
1133 | /* | 1123 | /* |
@@ -1861,7 +1851,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) | |||
1861 | int retries = 0; | 1851 | int retries = 0; |
1862 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1852 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1863 | struct ext4_inode_info *ei = EXT4_I(inode); | 1853 | struct ext4_inode_info *ei = EXT4_I(inode); |
1864 | unsigned long md_needed, md_reserved; | 1854 | unsigned long md_needed; |
1865 | int ret; | 1855 | int ret; |
1866 | 1856 | ||
1867 | /* | 1857 | /* |
@@ -1871,22 +1861,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) | |||
1871 | */ | 1861 | */ |
1872 | repeat: | 1862 | repeat: |
1873 | spin_lock(&ei->i_block_reservation_lock); | 1863 | spin_lock(&ei->i_block_reservation_lock); |
1874 | md_reserved = ei->i_reserved_meta_blocks; | ||
1875 | md_needed = ext4_calc_metadata_amount(inode, lblock); | 1864 | md_needed = ext4_calc_metadata_amount(inode, lblock); |
1876 | trace_ext4_da_reserve_space(inode, md_needed); | 1865 | trace_ext4_da_reserve_space(inode, md_needed); |
1877 | spin_unlock(&ei->i_block_reservation_lock); | 1866 | spin_unlock(&ei->i_block_reservation_lock); |
1878 | 1867 | ||
1879 | /* | 1868 | /* |
1880 | * Make quota reservation here to prevent quota overflow | 1869 | * We will charge metadata quota at writeout time; this saves |
1881 | * later. Real quota accounting is done at pages writeout | 1870 | * us from metadata over-estimation, though we may go over by |
1882 | * time. | 1871 | * a small amount in the end. Here we just reserve for data. |
1883 | */ | 1872 | */ |
1884 | ret = dquot_reserve_block(inode, md_needed + 1); | 1873 | ret = dquot_reserve_block(inode, 1); |
1885 | if (ret) | 1874 | if (ret) |
1886 | return ret; | 1875 | return ret; |
1887 | 1876 | /* | |
1877 | * We do still charge estimated metadata to the sb though; | ||
1878 | * we cannot afford to run out of free blocks. | ||
1879 | */ | ||
1888 | if (ext4_claim_free_blocks(sbi, md_needed + 1)) { | 1880 | if (ext4_claim_free_blocks(sbi, md_needed + 1)) { |
1889 | dquot_release_reservation_block(inode, md_needed + 1); | 1881 | dquot_release_reservation_block(inode, 1); |
1890 | if (ext4_should_retry_alloc(inode->i_sb, &retries)) { | 1882 | if (ext4_should_retry_alloc(inode->i_sb, &retries)) { |
1891 | yield(); | 1883 | yield(); |
1892 | goto repeat; | 1884 | goto repeat; |
@@ -1933,12 +1925,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free) | |||
1933 | * only when we have written all of the delayed | 1925 | * only when we have written all of the delayed |
1934 | * allocation blocks. | 1926 | * allocation blocks. |
1935 | */ | 1927 | */ |
1936 | to_free += ei->i_reserved_meta_blocks; | 1928 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, |
1929 | ei->i_reserved_meta_blocks); | ||
1937 | ei->i_reserved_meta_blocks = 0; | 1930 | ei->i_reserved_meta_blocks = 0; |
1938 | ei->i_da_metadata_calc_len = 0; | 1931 | ei->i_da_metadata_calc_len = 0; |
1939 | } | 1932 | } |
1940 | 1933 | ||
1941 | /* update fs dirty blocks counter */ | 1934 | /* update fs dirty data blocks counter */ |
1942 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); | 1935 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); |
1943 | 1936 | ||
1944 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1937 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
@@ -3086,7 +3079,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, | |||
3086 | loff_t pos, unsigned len, unsigned flags, | 3079 | loff_t pos, unsigned len, unsigned flags, |
3087 | struct page **pagep, void **fsdata) | 3080 | struct page **pagep, void **fsdata) |
3088 | { | 3081 | { |
3089 | int ret, retries = 0, quota_retries = 0; | 3082 | int ret, retries = 0; |
3090 | struct page *page; | 3083 | struct page *page; |
3091 | pgoff_t index; | 3084 | pgoff_t index; |
3092 | unsigned from, to; | 3085 | unsigned from, to; |
@@ -3145,22 +3138,6 @@ retry: | |||
3145 | 3138 | ||
3146 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | 3139 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
3147 | goto retry; | 3140 | goto retry; |
3148 | |||
3149 | if ((ret == -EDQUOT) && | ||
3150 | EXT4_I(inode)->i_reserved_meta_blocks && | ||
3151 | (quota_retries++ < 3)) { | ||
3152 | /* | ||
3153 | * Since we often over-estimate the number of meta | ||
3154 | * data blocks required, we may sometimes get a | ||
3155 | * spurios out of quota error even though there would | ||
3156 | * be enough space once we write the data blocks and | ||
3157 | * find out how many meta data blocks were _really_ | ||
3158 | * required. So try forcing the inode write to see if | ||
3159 | * that helps. | ||
3160 | */ | ||
3161 | write_inode_now(inode, (quota_retries == 3)); | ||
3162 | goto retry; | ||
3163 | } | ||
3164 | out: | 3141 | out: |
3165 | return ret; | 3142 | return ret; |
3166 | } | 3143 | } |