author     Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>    2008-07-23 14:14:05 -0400
committer  Theodore Ts'o <tytso@mit.edu>                         2008-07-23 14:14:05 -0400
commit     6be2ded1d7c51b39144b9f07d2c839e1bd8707f1 (patch)
tree       aed3b1a8a0ebb8d62152a469953d970926988392 /fs/ext4
parent     1320cbcf771a20b44cf580712b843d213ae75cd3 (diff)
ext4: Don't allow the lg prealloc list to grow large.
Currently, the locality group prealloc list is freed only when there
is a block allocation failure. This can result in a large number of
entries in the preallocation list, making ext4_mb_use_preallocated()
expensive.
To fix this, we convert the locality group prealloc list into a hash
of lists. The hash index is the order (fls(pa_free) - 1) of the number
of blocks in the prealloc space, with a maximum order of 9. When adding
a prealloc space to a list, we make sure the total number of entries
for that order does not exceed 8. If it grows beyond 8, we discard
entries until no more than 5 remain.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
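
Not part of the commit: the following is a minimal user-space sketch of the
bucketing and trimming policy described in the commit message, assuming the
PREALLOC_TB_SIZE value (10 buckets) and the 8/5 thresholds from the patch.
fls_u32(), lg_prealloc_order(), lg_prealloc_after_trim() and the sample sizes
are hypothetical helpers written only for this illustration.

/* Illustrative sketch (not kernel code): how a prealloc space maps to a
 * locality-group bucket and how an over-full bucket is trimmed. */
#include <stdio.h>

#define PREALLOC_TB_SIZE	10	/* number of hash buckets, as in mballoc.h */

/* Same semantics as the kernel's fls(): 1-based index of the highest set
 * bit, 0 when no bit is set.  Open-coded so the sketch builds in user
 * space without kernel headers. */
static int fls_u32(unsigned int x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

/* Bucket index for a prealloc space with 'pa_free' free blocks
 * (callers are assumed to guarantee pa_free > 0). */
static int lg_prealloc_order(unsigned int pa_free)
{
	int order = fls_u32(pa_free) - 1;

	/* The max size of the hash table is PREALLOC_TB_SIZE */
	if (order > PREALLOC_TB_SIZE - 1)
		order = PREALLOC_TB_SIZE - 1;
	return order;
}

/* Trimming policy: a bucket may grow to 8 entries; once it goes past 8,
 * entries are discarded until at most 5 remain, so the next few adds do
 * not trigger another discard right away. */
static int lg_prealloc_after_trim(int count)
{
	return count > 8 ? 5 : count;
}

int main(void)
{
	unsigned int sizes[] = { 1, 7, 64, 513, 4096 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("pa_free=%u -> bucket %d\n",
		       sizes[i], lg_prealloc_order(sizes[i]));
	printf("a bucket that reaches 9 entries is trimmed to %d\n",
	       lg_prealloc_after_trim(9));
	return 0;
}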
Diffstat (limited to 'fs/ext4')
 -rw-r--r--  fs/ext4/mballoc.c | 213
 -rw-r--r--  fs/ext4/mballoc.h |  10
 2 files changed, 193 insertions(+), 30 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 49bec8404c5f..865e9ddb44d4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2480,7 +2480,7 @@ err_freesgi:
 int ext4_mb_init(struct super_block *sb, int needs_recovery)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	unsigned i;
+	unsigned i, j;
 	unsigned offset;
 	unsigned max;
 	int ret;
@@ -2552,7 +2552,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 		struct ext4_locality_group *lg;
 		lg = &sbi->s_locality_groups[i];
 		mutex_init(&lg->lg_mutex);
-		INIT_LIST_HEAD(&lg->lg_prealloc_list);
+		for (j = 0; j < PREALLOC_TB_SIZE; j++)
+			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
 		spin_lock_init(&lg->lg_prealloc_lock);
 	}
 
@@ -3263,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
 					struct ext4_prealloc_space *pa)
 {
 	unsigned int len = ac->ac_o_ex.fe_len;
+
 	ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
 					&ac->ac_b_ex.fe_group,
 					&ac->ac_b_ex.fe_start);
@@ -3285,6 +3287,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
 static noinline_for_stack int
 ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 {
+	int order, i;
 	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
 	struct ext4_locality_group *lg;
 	struct ext4_prealloc_space *pa;
@@ -3325,22 +3328,29 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 	lg = ac->ac_lg;
 	if (lg == NULL)
 		return 0;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) {
-		spin_lock(&pa->pa_lock);
-		if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
-			atomic_inc(&pa->pa_count);
-			ext4_mb_use_group_pa(ac, pa);
+	order = fls(ac->ac_o_ex.fe_len) - 1;
+	if (order > PREALLOC_TB_SIZE - 1)
+		/* The max size of hash table is PREALLOC_TB_SIZE */
+		order = PREALLOC_TB_SIZE - 1;
+
+	for (i = order; i < PREALLOC_TB_SIZE; i++) {
+		rcu_read_lock();
+		list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
+					pa_inode_list) {
+			spin_lock(&pa->pa_lock);
+			if (pa->pa_deleted == 0 &&
+				pa->pa_free >= ac->ac_o_ex.fe_len) {
+				atomic_inc(&pa->pa_count);
+				ext4_mb_use_group_pa(ac, pa);
+				spin_unlock(&pa->pa_lock);
+				ac->ac_criteria = 20;
+				rcu_read_unlock();
+				return 1;
+			}
 			spin_unlock(&pa->pa_lock);
-			ac->ac_criteria = 20;
-			rcu_read_unlock();
-			return 1;
 		}
-		spin_unlock(&pa->pa_lock);
+		rcu_read_unlock();
 	}
-	rcu_read_unlock();
-
 	return 0;
 }
 
@@ -3563,6 +3573,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	pa->pa_free = pa->pa_len;
 	atomic_set(&pa->pa_count, 1);
 	spin_lock_init(&pa->pa_lock);
+	INIT_LIST_HEAD(&pa->pa_inode_list);
 	pa->pa_deleted = 0;
 	pa->pa_linear = 1;
 
@@ -3583,10 +3594,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
 	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
 
-	spin_lock(pa->pa_obj_lock);
-	list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
-	spin_unlock(pa->pa_obj_lock);
-
+	/*
+	 * We will later add the new pa to the right bucket
+	 * after updating the pa_free in ext4_mb_release_context
+	 */
 	return 0;
 }
 
@@ -4123,22 +4134,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 
 }
 
+static noinline_for_stack void
+ext4_mb_discard_lg_preallocations(struct super_block *sb,
+					struct ext4_locality_group *lg,
+					int order, int total_entries)
+{
+	ext4_group_t group = 0;
+	struct ext4_buddy e4b;
+	struct list_head discard_list;
+	struct ext4_prealloc_space *pa, *tmp;
+	struct ext4_allocation_context *ac;
+
+	mb_debug("discard locality group preallocation\n");
+
+	INIT_LIST_HEAD(&discard_list);
+	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+
+	spin_lock(&lg->lg_prealloc_lock);
+	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
+						pa_inode_list) {
+		spin_lock(&pa->pa_lock);
+		if (atomic_read(&pa->pa_count)) {
+			/*
+			 * This is the pa that we just used
+			 * for block allocation. So don't
+			 * free that
+			 */
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		if (pa->pa_deleted) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		/* only lg prealloc space */
+		BUG_ON(!pa->pa_linear);
+
+		/* seems this one can be freed ... */
+		pa->pa_deleted = 1;
+		spin_unlock(&pa->pa_lock);
+
+		list_del_rcu(&pa->pa_inode_list);
+		list_add(&pa->u.pa_tmp_list, &discard_list);
+
+		total_entries--;
+		if (total_entries <= 5) {
+			/*
+			 * we want to keep only 5 entries
+			 * allowing it to grow to 8. This
+			 * mak sure we don't call discard
+			 * soon for this list.
+			 */
+			break;
+		}
+	}
+	spin_unlock(&lg->lg_prealloc_lock);
+
+	list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
+
+		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+		if (ext4_mb_load_buddy(sb, group, &e4b)) {
+			ext4_error(sb, __func__, "Error in loading buddy "
+					"information for %lu\n", group);
+			continue;
+		}
+		ext4_lock_group(sb, group);
+		list_del(&pa->pa_group_list);
+		ext4_mb_release_group_pa(&e4b, pa, ac);
+		ext4_unlock_group(sb, group);
+
+		ext4_mb_release_desc(&e4b);
+		list_del(&pa->u.pa_tmp_list);
+		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+	}
+	if (ac)
+		kmem_cache_free(ext4_ac_cachep, ac);
+}
+
+/*
+ * We have incremented pa_count. So it cannot be freed at this
+ * point. Also we hold lg_mutex. So no parallel allocation is
+ * possible from this lg. That means pa_free cannot be updated.
+ *
+ * A parallel ext4_mb_discard_group_preallocations is possible.
+ * which can cause the lg_prealloc_list to be updated.
+ */
+
+static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
+{
+	int order, added = 0, lg_prealloc_count = 1;
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_locality_group *lg = ac->ac_lg;
+	struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
+
+	order = fls(pa->pa_free) - 1;
+	if (order > PREALLOC_TB_SIZE - 1)
+		/* The max size of hash table is PREALLOC_TB_SIZE */
+		order = PREALLOC_TB_SIZE - 1;
+	/* Add the prealloc space to lg */
+	rcu_read_lock();
+	list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
+						pa_inode_list) {
+		spin_lock(&tmp_pa->pa_lock);
+		if (tmp_pa->pa_deleted) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		if (!added && pa->pa_free < tmp_pa->pa_free) {
+			/* Add to the tail of the previous entry */
+			list_add_tail_rcu(&pa->pa_inode_list,
+						&tmp_pa->pa_inode_list);
+			added = 1;
+			/*
+			 * we want to count the total
+			 * number of entries in the list
+			 */
+		}
+		spin_unlock(&tmp_pa->pa_lock);
+		lg_prealloc_count++;
+	}
+	if (!added)
+		list_add_tail_rcu(&pa->pa_inode_list,
+					&lg->lg_prealloc_list[order]);
+	rcu_read_unlock();
+
+	/* Now trim the list to be not more than 8 elements */
+	if (lg_prealloc_count > 8) {
+		ext4_mb_discard_lg_preallocations(sb, lg,
+						order, lg_prealloc_count);
+		return;
+	}
+	return ;
+}
+
 /*
  * release all resource we used in allocation
  */
 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 {
-	if (ac->ac_pa) {
-		if (ac->ac_pa->pa_linear) {
+	struct ext4_prealloc_space *pa = ac->ac_pa;
+	if (pa) {
+		if (pa->pa_linear) {
 			/* see comment in ext4_mb_use_group_pa() */
-			spin_lock(&ac->ac_pa->pa_lock);
-			ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
-			ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
-			ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
-			ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
-			spin_unlock(&ac->ac_pa->pa_lock);
+			spin_lock(&pa->pa_lock);
+			pa->pa_pstart += ac->ac_b_ex.fe_len;
+			pa->pa_lstart += ac->ac_b_ex.fe_len;
+			pa->pa_free -= ac->ac_b_ex.fe_len;
+			pa->pa_len -= ac->ac_b_ex.fe_len;
+			spin_unlock(&pa->pa_lock);
+			/*
+			 * We want to add the pa to the right bucket.
+			 * Remove it from the list and while adding
+			 * make sure the list to which we are adding
+			 * doesn't grow big.
+			 */
+			if (likely(pa->pa_free)) {
+				spin_lock(pa->pa_obj_lock);
+				list_del_rcu(&pa->pa_inode_list);
+				spin_unlock(pa->pa_obj_lock);
+				ext4_mb_add_n_trim(ac);
+			}
 		}
-		ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
+		ext4_mb_put_pa(ac, ac->ac_sb, pa);
 	}
 	if (ac->ac_bitmap_page)
 		page_cache_release(ac->ac_bitmap_page);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index bfe6add46bcf..c7c9906c2a75 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -164,11 +164,17 @@ struct ext4_free_extent {
  * Locality group:
  * we try to group all related changes together
  * so that writeback can flush/allocate them together as well
+ * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
+ * (512). We store prealloc space into the hash based on the pa_free blocks
+ * order value.ie, fls(pa_free)-1;
  */
+#define PREALLOC_TB_SIZE 10
 struct ext4_locality_group {
 	/* for allocator */
-	struct mutex		lg_mutex;	/* to serialize allocates */
-	struct list_head	lg_prealloc_list;/* list of preallocations */
+	/* to serialize allocates */
+	struct mutex		lg_mutex;
+	/* list of preallocations */
+	struct list_head	lg_prealloc_list[PREALLOC_TB_SIZE];
 	spinlock_t		lg_prealloc_lock;
 };
 