author     Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>	2008-07-23 14:14:05 -0400
committer  Theodore Ts'o <tytso@mit.edu>	2008-07-23 14:14:05 -0400
commit     6be2ded1d7c51b39144b9f07d2c839e1bd8707f1 (patch)
tree       aed3b1a8a0ebb8d62152a469953d970926988392
parent     1320cbcf771a20b44cf580712b843d213ae75cd3 (diff)
ext4: Don't allow lg prealloc list to grow large.

Currently, the locality group prealloc list is freed only when there is
a block allocation failure.  This can result in a large number of
entries in the preallocation list, making ext4_mb_use_preallocated()
expensive.

To fix this, we convert the locality group prealloc list to a hash
list.  The hash index is the order of the number of blocks in the
prealloc space, with a maximum order of 9.  When adding prealloc space
to the list we make sure the total number of entries for each order
does not exceed 8.  If it does, we discard a few entries and make sure
the list is left with no more than 5 entries.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
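[Editor's note] The bucket index used throughout the patch is just the
order of the request size, clamped to the table.  A minimal user-space
sketch of that selection logic (fls_compat() and prealloc_bucket() are
illustrative stand-ins written for this note, not kernel functions;
PREALLOC_TB_SIZE mirrors the constant the patch adds to mballoc.h):

#include <stdio.h>

#define PREALLOC_TB_SIZE 10	/* number of hash buckets, as in mballoc.h */

/* Portable stand-in for the kernel's fls(): position of the highest
 * set bit, 1-based; returns 0 when x is 0. */
static int fls_compat(unsigned int x)
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

/* Bucket selection as described above: order = fls(len) - 1, clamped
 * so that all large requests share the last bucket. */
static int prealloc_bucket(unsigned int fe_len)
{
	int order = fls_compat(fe_len) - 1;

	if (order > PREALLOC_TB_SIZE - 1)
		order = PREALLOC_TB_SIZE - 1;
	return order;
}

int main(void)
{
	unsigned int sizes[] = { 1, 7, 8, 64, 511, 512, 4096 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("len %4u -> bucket %d\n", sizes[i],
		       prealloc_bucket(sizes[i]));
	return 0;
}

With PREALLOC_TB_SIZE = 10, a 512-block request (the
MB_DEFAULT_GROUP_PREALLOC size mentioned in the mballoc.h comment below)
maps to order 9, the maximum order the commit message refers to;
anything larger is clamped into the same bucket.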
-rw-r--r--	fs/ext4/mballoc.c	213
-rw-r--r--	fs/ext4/mballoc.h	10
2 files changed, 193 insertions(+), 30 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 49bec8404c5f..865e9ddb44d4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2480,7 +2480,7 @@ err_freesgi:
 int ext4_mb_init(struct super_block *sb, int needs_recovery)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	unsigned i;
+	unsigned i, j;
 	unsigned offset;
 	unsigned max;
 	int ret;
@@ -2552,7 +2552,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 		struct ext4_locality_group *lg;
 		lg = &sbi->s_locality_groups[i];
 		mutex_init(&lg->lg_mutex);
-		INIT_LIST_HEAD(&lg->lg_prealloc_list);
+		for (j = 0; j < PREALLOC_TB_SIZE; j++)
+			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
 		spin_lock_init(&lg->lg_prealloc_lock);
 	}
 
@@ -3263,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
 				struct ext4_prealloc_space *pa)
 {
 	unsigned int len = ac->ac_o_ex.fe_len;
+
 	ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
 					&ac->ac_b_ex.fe_group,
 					&ac->ac_b_ex.fe_start);
@@ -3285,6 +3287,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
 static noinline_for_stack int
 ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 {
+	int order, i;
 	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
 	struct ext4_locality_group *lg;
 	struct ext4_prealloc_space *pa;
@@ -3325,22 +3328,29 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 	lg = ac->ac_lg;
 	if (lg == NULL)
 		return 0;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) {
-		spin_lock(&pa->pa_lock);
-		if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
-			atomic_inc(&pa->pa_count);
-			ext4_mb_use_group_pa(ac, pa);
+	order = fls(ac->ac_o_ex.fe_len) - 1;
+	if (order > PREALLOC_TB_SIZE - 1)
+		/* The max size of hash table is PREALLOC_TB_SIZE */
+		order = PREALLOC_TB_SIZE - 1;
+
+	for (i = order; i < PREALLOC_TB_SIZE; i++) {
+		rcu_read_lock();
+		list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
+					pa_inode_list) {
+			spin_lock(&pa->pa_lock);
+			if (pa->pa_deleted == 0 &&
+					pa->pa_free >= ac->ac_o_ex.fe_len) {
+				atomic_inc(&pa->pa_count);
+				ext4_mb_use_group_pa(ac, pa);
+				spin_unlock(&pa->pa_lock);
+				ac->ac_criteria = 20;
+				rcu_read_unlock();
+				return 1;
+			}
 			spin_unlock(&pa->pa_lock);
-			ac->ac_criteria = 20;
-			rcu_read_unlock();
-			return 1;
 		}
-		spin_unlock(&pa->pa_lock);
+		rcu_read_unlock();
 	}
-	rcu_read_unlock();
-
 	return 0;
 }
 
@@ -3563,6 +3573,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	pa->pa_free = pa->pa_len;
 	atomic_set(&pa->pa_count, 1);
 	spin_lock_init(&pa->pa_lock);
+	INIT_LIST_HEAD(&pa->pa_inode_list);
 	pa->pa_deleted = 0;
 	pa->pa_linear = 1;
 
@@ -3583,10 +3594,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
 	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
 
-	spin_lock(pa->pa_obj_lock);
-	list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
-	spin_unlock(pa->pa_obj_lock);
-
+	/*
+	 * We will later add the new pa to the right bucket
+	 * after updating the pa_free in ext4_mb_release_context
+	 */
 	return 0;
 }
 
@@ -4123,22 +4134,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 
 }
 
+static noinline_for_stack void
+ext4_mb_discard_lg_preallocations(struct super_block *sb,
+					struct ext4_locality_group *lg,
+					int order, int total_entries)
+{
+	ext4_group_t group = 0;
+	struct ext4_buddy e4b;
+	struct list_head discard_list;
+	struct ext4_prealloc_space *pa, *tmp;
+	struct ext4_allocation_context *ac;
+
+	mb_debug("discard locality group preallocation\n");
+
+	INIT_LIST_HEAD(&discard_list);
+	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+
+	spin_lock(&lg->lg_prealloc_lock);
+	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
+						pa_inode_list) {
+		spin_lock(&pa->pa_lock);
+		if (atomic_read(&pa->pa_count)) {
+			/*
+			 * This is the pa that we just used
+			 * for block allocation. So don't
+			 * free that
+			 */
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		if (pa->pa_deleted) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		/* only lg prealloc space */
+		BUG_ON(!pa->pa_linear);
+
+		/* seems this one can be freed ... */
+		pa->pa_deleted = 1;
+		spin_unlock(&pa->pa_lock);
+
+		list_del_rcu(&pa->pa_inode_list);
+		list_add(&pa->u.pa_tmp_list, &discard_list);
+
+		total_entries--;
+		if (total_entries <= 5) {
+			/*
+			 * we want to keep only 5 entries
+			 * allowing it to grow to 8. This
+			 * makes sure we don't call discard
+			 * too soon for this list.
+			 */
+			break;
+		}
+	}
+	spin_unlock(&lg->lg_prealloc_lock);
+
+	list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
+
+		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+		if (ext4_mb_load_buddy(sb, group, &e4b)) {
+			ext4_error(sb, __func__, "Error in loading buddy "
+					"information for %lu\n", group);
+			continue;
+		}
+		ext4_lock_group(sb, group);
+		list_del(&pa->pa_group_list);
+		ext4_mb_release_group_pa(&e4b, pa, ac);
+		ext4_unlock_group(sb, group);
+
+		ext4_mb_release_desc(&e4b);
+		list_del(&pa->u.pa_tmp_list);
+		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+	}
+	if (ac)
+		kmem_cache_free(ext4_ac_cachep, ac);
+}
+
+/*
+ * We have incremented pa_count. So it cannot be freed at this
+ * point. Also we hold lg_mutex. So no parallel allocation is
+ * possible from this lg. That means pa_free cannot be updated.
+ *
+ * A parallel ext4_mb_discard_group_preallocations is possible,
+ * which can cause the lg_prealloc_list to be updated.
+ */
+
+static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
+{
+	int order, added = 0, lg_prealloc_count = 1;
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_locality_group *lg = ac->ac_lg;
+	struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
+
+	order = fls(pa->pa_free) - 1;
+	if (order > PREALLOC_TB_SIZE - 1)
+		/* The max size of hash table is PREALLOC_TB_SIZE */
+		order = PREALLOC_TB_SIZE - 1;
+	/* Add the prealloc space to lg */
+	rcu_read_lock();
+	list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
+						pa_inode_list) {
+		spin_lock(&tmp_pa->pa_lock);
+		if (tmp_pa->pa_deleted) {
+			spin_unlock(&tmp_pa->pa_lock);
+			continue;
+		}
+		if (!added && pa->pa_free < tmp_pa->pa_free) {
+			/* Add to the tail of the previous entry */
+			list_add_tail_rcu(&pa->pa_inode_list,
+						&tmp_pa->pa_inode_list);
+			added = 1;
+			/*
+			 * we want to count the total
+			 * number of entries in the list
+			 */
+		}
+		spin_unlock(&tmp_pa->pa_lock);
+		lg_prealloc_count++;
+	}
+	if (!added)
+		list_add_tail_rcu(&pa->pa_inode_list,
+					&lg->lg_prealloc_list[order]);
+	rcu_read_unlock();
+
+	/* Now trim the list to be not more than 8 elements */
+	if (lg_prealloc_count > 8) {
+		ext4_mb_discard_lg_preallocations(sb, lg,
+						order, lg_prealloc_count);
+		return;
+	}
+	return;
+}
+
 /*
  * release all resource we used in allocation
  */
 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 {
-	if (ac->ac_pa) {
-		if (ac->ac_pa->pa_linear) {
+	struct ext4_prealloc_space *pa = ac->ac_pa;
+	if (pa) {
+		if (pa->pa_linear) {
 			/* see comment in ext4_mb_use_group_pa() */
-			spin_lock(&ac->ac_pa->pa_lock);
-			ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
-			ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
-			ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
-			ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
-			spin_unlock(&ac->ac_pa->pa_lock);
+			spin_lock(&pa->pa_lock);
+			pa->pa_pstart += ac->ac_b_ex.fe_len;
+			pa->pa_lstart += ac->ac_b_ex.fe_len;
+			pa->pa_free -= ac->ac_b_ex.fe_len;
+			pa->pa_len -= ac->ac_b_ex.fe_len;
+			spin_unlock(&pa->pa_lock);
+			/*
+			 * We want to add the pa to the right bucket.
+			 * Remove it from the list and while adding
+			 * make sure the list to which we are adding
+			 * doesn't grow big.
+			 */
+			if (likely(pa->pa_free)) {
+				spin_lock(pa->pa_obj_lock);
+				list_del_rcu(&pa->pa_inode_list);
+				spin_unlock(pa->pa_obj_lock);
+				ext4_mb_add_n_trim(ac);
+			}
 		}
-		ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
+		ext4_mb_put_pa(ac, ac->ac_sb, pa);
 	}
 	if (ac->ac_bitmap_page)
 		page_cache_release(ac->ac_bitmap_page);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index bfe6add46bcf..c7c9906c2a75 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -164,11 +164,17 @@ struct ext4_free_extent {
  * Locality group:
  * we try to group all related changes together
  * so that writeback can flush/allocate them together as well
+ * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
+ * (512). We store prealloc space into the hash based on the pa_free blocks
+ * order value, i.e. fls(pa_free) - 1.
  */
+#define PREALLOC_TB_SIZE 10
 struct ext4_locality_group {
 	/* for allocator */
-	struct mutex lg_mutex;	/* to serialize allocates */
-	struct list_head lg_prealloc_list;/* list of preallocations */
+	/* to serialize allocates */
+	struct mutex lg_mutex;
+	/* list of preallocations */
+	struct list_head lg_prealloc_list[PREALLOC_TB_SIZE];
 	spinlock_t lg_prealloc_lock;
 };
 
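[Editor's note] For reference, a toy user-space model of the per-bucket
trim policy that ext4_mb_add_n_trim() and
ext4_mb_discard_lg_preallocations() implement above, under the
simplifying assumption that every entry is discardable (in the kernel,
entries whose pa_count is raised are skipped); bucket_len, BUCKET_MAX
and BUCKET_KEEP are invented names for this sketch:

#include <stdio.h>

#define BUCKET_MAX	8	/* a bucket may grow to 8 entries */
#define BUCKET_KEEP	5	/* a trim leaves at most 5 behind */

static int bucket_len;		/* stands in for one lg_prealloc_list[] bucket */

static void add_n_trim_model(void)
{
	bucket_len++;				/* list_add_tail_rcu(...) */
	if (bucket_len > BUCKET_MAX)		/* same test as ext4_mb_add_n_trim */
		bucket_len = BUCKET_KEEP;	/* discard down to <= 5 entries */
}

int main(void)
{
	int i;

	for (i = 0; i < 12; i++) {
		add_n_trim_model();
		printf("after insert %2d: %d entries\n", i + 1, bucket_len);
	}
	return 0;
}

The gap between the two thresholds gives a bucket room to grow again
before the next discard pass, which is the hysteresis the in-code
comment ("we want to keep only 5 entries allowing it to grow to 8")
describes.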