aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Eric Sandeen <sandeen@redhat.com> 2010-05-16 11:00:00 -0400
committer Theodore Ts'o <tytso@mit.edu> 2010-05-16 11:00:00 -0400
commit 72b8ab9dde211ea518ff27e631b2046ef90c29a2 (patch)
tree f63fc1894bcc39c81060e2fd8b21bedd07a1e958
parent 0e05842bc117ea70ceb979cca798fd026879951b (diff)
ext4: don't use quota reservation for speculative metadata
Because we can badly over-reserve metadata when we calculate worst-case, it complicates things for quota, since we must reserve and then claim later, retry on EDQUOT, etc. Quota is also a generally smaller pool than fs free blocks, so this over-reservation hurts more, and more often.

I'm of the opinion that it's not the worst thing to allow metadata to push a user slightly over quota. This simplifies the code and avoids the false quota rejections that result from worst-case speculation.

This patch stops the speculative quota-charging for worst-case metadata requirements, and just charges quota when the blocks are allocated at writeout. It also is able to remove the try-again loop on EDQUOT.

This patch has been tested indirectly by running the xfstests suite with a hack to mount & enable quota prior to the test.

I also did a more specific test of fragmenting freespace and then doing a large delalloc write under quota; quota stopped me at the right amount of file IO, and then the writeout generated enough metadata (due to the fragmentation) that it put me slightly over quota, as expected.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r-- fs/ext4/balloc.c 5
-rw-r--r-- fs/ext4/inode.c 69
2 files changed, 26 insertions, 48 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2f37a5516c7..95b7594c76f9 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -591,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
591 ret = ext4_mb_new_blocks(handle, &ar, errp); 591 ret = ext4_mb_new_blocks(handle, &ar, errp);
592 if (count) 592 if (count)
593 *count = ar.len; 593 *count = ar.len;
594
595 /* 594 /*
596 * Account for the allocated meta blocks 595 * Account for the allocated meta blocks. We will never
596 * fail EDQUOT for metadata, but we do account for it.
597 */ 597 */
598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
602 dquot_alloc_block_nofail(inode, ar.len);
602 } 603 }
603 return ret; 604 return ret;
604} 605}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 89a31e8869cc..df43217e4e72 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1076,7 +1076,6 @@ void ext4_da_update_reserve_space(struct inode *inode,
1076{ 1076{
1077 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1077 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1078 struct ext4_inode_info *ei = EXT4_I(inode); 1078 struct ext4_inode_info *ei = EXT4_I(inode);
1079 int mdb_free = 0, allocated_meta_blocks = 0;
1080 1079
1081 spin_lock(&ei->i_block_reservation_lock); 1080 spin_lock(&ei->i_block_reservation_lock);
1082 trace_ext4_da_update_reserve_space(inode, used); 1081 trace_ext4_da_update_reserve_space(inode, used);
@@ -1091,11 +1090,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
1091 1090
1092 /* Update per-inode reservations */ 1091 /* Update per-inode reservations */
1093 ei->i_reserved_data_blocks -= used; 1092 ei->i_reserved_data_blocks -= used;
1094 used += ei->i_allocated_meta_blocks;
1095 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 1093 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1096 allocated_meta_blocks = ei->i_allocated_meta_blocks; 1094 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1095 used + ei->i_allocated_meta_blocks);
1097 ei->i_allocated_meta_blocks = 0; 1096 ei->i_allocated_meta_blocks = 0;
1098 percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
1099 1097
1100 if (ei->i_reserved_data_blocks == 0) { 1098 if (ei->i_reserved_data_blocks == 0) {
1101 /* 1099 /*
@@ -1103,31 +1101,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
1103 * only when we have written all of the delayed 1101 * only when we have written all of the delayed
1104 * allocation blocks. 1102 * allocation blocks.
1105 */ 1103 */
1106 mdb_free = ei->i_reserved_meta_blocks; 1104 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1105 ei->i_reserved_meta_blocks);
1107 ei->i_reserved_meta_blocks = 0; 1106 ei->i_reserved_meta_blocks = 0;
1108 ei->i_da_metadata_calc_len = 0; 1107 ei->i_da_metadata_calc_len = 0;
1109 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1110 } 1108 }
1111 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1109 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1112 1110
1113 /* Update quota subsystem */ 1111 /* Update quota subsystem for data blocks */
1114 if (quota_claim) { 1112 if (quota_claim)
1115 dquot_claim_block(inode, used); 1113 dquot_claim_block(inode, used);
1116 if (mdb_free) 1114 else {
1117 dquot_release_reservation_block(inode, mdb_free);
1118 } else {
1119 /* 1115 /*
1120 * We did fallocate with an offset that is already delayed 1116 * We did fallocate with an offset that is already delayed
1121 * allocated. So on delayed allocated writeback we should 1117 * allocated. So on delayed allocated writeback we should
1122 * not update the quota for allocated blocks. But then 1118 * not re-claim the quota for fallocated blocks.
1123 * converting an fallocate region to initialized region would
1124 * have caused a metadata allocation. So claim quota for
1125 * that
1126 */ 1119 */
1127 if (allocated_meta_blocks) 1120 dquot_release_reservation_block(inode, used);
1128 dquot_claim_block(inode, allocated_meta_blocks);
1129 dquot_release_reservation_block(inode, mdb_free + used -
1130 allocated_meta_blocks);
1131 } 1121 }
1132 1122
1133 /* 1123 /*
@@ -1861,7 +1851,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1861 int retries = 0; 1851 int retries = 0;
1862 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1852 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1863 struct ext4_inode_info *ei = EXT4_I(inode); 1853 struct ext4_inode_info *ei = EXT4_I(inode);
1864 unsigned long md_needed, md_reserved; 1854 unsigned long md_needed;
1865 int ret; 1855 int ret;
1866 1856
1867 /* 1857 /*
@@ -1871,22 +1861,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1871 */ 1861 */
1872repeat: 1862repeat:
1873 spin_lock(&ei->i_block_reservation_lock); 1863 spin_lock(&ei->i_block_reservation_lock);
1874 md_reserved = ei->i_reserved_meta_blocks;
1875 md_needed = ext4_calc_metadata_amount(inode, lblock); 1864 md_needed = ext4_calc_metadata_amount(inode, lblock);
1876 trace_ext4_da_reserve_space(inode, md_needed); 1865 trace_ext4_da_reserve_space(inode, md_needed);
1877 spin_unlock(&ei->i_block_reservation_lock); 1866 spin_unlock(&ei->i_block_reservation_lock);
1878 1867
1879 /* 1868 /*
1880 * Make quota reservation here to prevent quota overflow 1869 * We will charge metadata quota at writeout time; this saves
1881 * later. Real quota accounting is done at pages writeout 1870 * us from metadata over-estimation, though we may go over by
1882 * time. 1871 * a small amount in the end. Here we just reserve for data.
1883 */ 1872 */
1884 ret = dquot_reserve_block(inode, md_needed + 1); 1873 ret = dquot_reserve_block(inode, 1);
1885 if (ret) 1874 if (ret)
1886 return ret; 1875 return ret;
1887 1876 /*
1877 * We do still charge estimated metadata to the sb though;
1878 * we cannot afford to run out of free blocks.
1879 */
1888 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1880 if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1889 dquot_release_reservation_block(inode, md_needed + 1); 1881 dquot_release_reservation_block(inode, 1);
1890 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1882 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1891 yield(); 1883 yield();
1892 goto repeat; 1884 goto repeat;
@@ -1933,12 +1925,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1933 * only when we have written all of the delayed 1925 * only when we have written all of the delayed
1934 * allocation blocks. 1926 * allocation blocks.
1935 */ 1927 */
1936 to_free += ei->i_reserved_meta_blocks; 1928 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1929 ei->i_reserved_meta_blocks);
1937 ei->i_reserved_meta_blocks = 0; 1930 ei->i_reserved_meta_blocks = 0;
1938 ei->i_da_metadata_calc_len = 0; 1931 ei->i_da_metadata_calc_len = 0;
1939 } 1932 }
1940 1933
1941 /* update fs dirty blocks counter */ 1934 /* update fs dirty data blocks counter */
1942 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1935 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1943 1936
1944 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1937 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -3086,7 +3079,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3086 loff_t pos, unsigned len, unsigned flags, 3079 loff_t pos, unsigned len, unsigned flags,
3087 struct page **pagep, void **fsdata) 3080 struct page **pagep, void **fsdata)
3088{ 3081{
3089 int ret, retries = 0, quota_retries = 0; 3082 int ret, retries = 0;
3090 struct page *page; 3083 struct page *page;
3091 pgoff_t index; 3084 pgoff_t index;
3092 unsigned from, to; 3085 unsigned from, to;
@@ -3145,22 +3138,6 @@ retry:
3145 3138
3146 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3139 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3147 goto retry; 3140 goto retry;
3148
3149 if ((ret == -EDQUOT) &&
3150 EXT4_I(inode)->i_reserved_meta_blocks &&
3151 (quota_retries++ < 3)) {
3152 /*
3153 * Since we often over-estimate the number of meta
3154 * data blocks required, we may sometimes get a
3155 * spurios out of quota error even though there would
3156 * be enough space once we write the data blocks and
3157 * find out how many meta data blocks were _really_
3158 * required. So try forcing the inode write to see if
3159 * that helps.
3160 */
3161 write_inode_now(inode, (quota_retries == 3));
3162 goto retry;
3163 }
3164out: 3141out:
3165 return ret; 3142 return ret;
3166} 3143}