Diffstat (limited to 'fs/ext4/mballoc.c')
-rw-r--r--   fs/ext4/mballoc.c   254
1 file changed, 213 insertions, 41 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8d141a25bbee..865e9ddb44d4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -787,13 +787,16 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		if (bh_uptodate_or_lock(bh[i]))
 			continue;
 
+		spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
 		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 			ext4_init_block_bitmap(sb, bh[i],
 						first_group + i, desc);
 			set_buffer_uptodate(bh[i]);
 			unlock_buffer(bh[i]);
+			spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
 			continue;
 		}
+		spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
 		get_bh(bh[i]);
 		bh[i]->b_end_io = end_buffer_read_sync;
 		submit_bh(READ, bh[i]);
@@ -2477,7 +2480,7 @@ err_freesgi:
 int ext4_mb_init(struct super_block *sb, int needs_recovery)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	unsigned i;
+	unsigned i, j;
 	unsigned offset;
 	unsigned max;
 	int ret;
@@ -2537,7 +2540,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
 	sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
 
-	i = sizeof(struct ext4_locality_group) * NR_CPUS;
+	i = sizeof(struct ext4_locality_group) * nr_cpu_ids;
 	sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_locality_groups == NULL) {
 		clear_opt(sbi->s_mount_opt, MBALLOC);
@@ -2545,11 +2548,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 		kfree(sbi->s_mb_maxs);
 		return -ENOMEM;
 	}
-	for (i = 0; i < NR_CPUS; i++) {
+	for (i = 0; i < nr_cpu_ids; i++) {
 		struct ext4_locality_group *lg;
 		lg = &sbi->s_locality_groups[i];
 		mutex_init(&lg->lg_mutex);
-		INIT_LIST_HEAD(&lg->lg_prealloc_list);
+		for (j = 0; j < PREALLOC_TB_SIZE; j++)
+			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
 		spin_lock_init(&lg->lg_prealloc_lock);
 	}
 
@@ -3260,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
 				struct ext4_prealloc_space *pa)
 {
 	unsigned int len = ac->ac_o_ex.fe_len;
+
 	ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
 					&ac->ac_b_ex.fe_group,
 					&ac->ac_b_ex.fe_start);
@@ -3282,6 +3287,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
 static noinline_for_stack int
 ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 {
+	int order, i;
 	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
 	struct ext4_locality_group *lg;
 	struct ext4_prealloc_space *pa;
@@ -3322,22 +3328,29 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 	lg = ac->ac_lg;
 	if (lg == NULL)
 		return 0;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) {
-		spin_lock(&pa->pa_lock);
-		if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
-			atomic_inc(&pa->pa_count);
-			ext4_mb_use_group_pa(ac, pa);
+	order = fls(ac->ac_o_ex.fe_len) - 1;
+	if (order > PREALLOC_TB_SIZE - 1)
+		/* The max size of hash table is PREALLOC_TB_SIZE */
+		order = PREALLOC_TB_SIZE - 1;
+
+	for (i = order; i < PREALLOC_TB_SIZE; i++) {
+		rcu_read_lock();
+		list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
+					pa_inode_list) {
+			spin_lock(&pa->pa_lock);
+			if (pa->pa_deleted == 0 &&
+					pa->pa_free >= ac->ac_o_ex.fe_len) {
+				atomic_inc(&pa->pa_count);
+				ext4_mb_use_group_pa(ac, pa);
+				spin_unlock(&pa->pa_lock);
+				ac->ac_criteria = 20;
+				rcu_read_unlock();
+				return 1;
+			}
 			spin_unlock(&pa->pa_lock);
-			ac->ac_criteria = 20;
-			rcu_read_unlock();
-			return 1;
 		}
-		spin_unlock(&pa->pa_lock);
+		rcu_read_unlock();
 	}
-	rcu_read_unlock();
-
 	return 0;
 }
 
@@ -3560,6 +3573,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	pa->pa_free = pa->pa_len;
 	atomic_set(&pa->pa_count, 1);
 	spin_lock_init(&pa->pa_lock);
+	INIT_LIST_HEAD(&pa->pa_inode_list);
 	pa->pa_deleted = 0;
 	pa->pa_linear = 1;
 
@@ -3580,10 +3594,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
 	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
 
-	spin_lock(pa->pa_obj_lock);
-	list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
-	spin_unlock(pa->pa_obj_lock);
-
+	/*
+	 * We will later add the new pa to the right bucket
+	 * after updating the pa_free in ext4_mb_release_context
+	 */
 	return 0;
 }
 
@@ -3733,20 +3747,23 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 
 	bitmap_bh = ext4_read_block_bitmap(sb, group);
 	if (bitmap_bh == NULL) {
-		/* error handling here */
-		ext4_mb_release_desc(&e4b);
-		BUG_ON(bitmap_bh == NULL);
+		ext4_error(sb, __func__, "Error in reading block "
+				"bitmap for %lu\n", group);
+		return 0;
 	}
 
 	err = ext4_mb_load_buddy(sb, group, &e4b);
-	BUG_ON(err != 0); /* error handling here */
+	if (err) {
+		ext4_error(sb, __func__, "Error in loading buddy "
+				"information for %lu\n", group);
+		put_bh(bitmap_bh);
+		return 0;
+	}
 
 	if (needed == 0)
 		needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
 
-	grp = ext4_get_group_info(sb, group);
 	INIT_LIST_HEAD(&list);
-
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
 repeat:
 	ext4_lock_group(sb, group);
@@ -3903,13 +3920,18 @@ repeat:
 		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
 
 		err = ext4_mb_load_buddy(sb, group, &e4b);
-		BUG_ON(err != 0); /* error handling here */
+		if (err) {
+			ext4_error(sb, __func__, "Error in loading buddy "
+					"information for %lu\n", group);
+			continue;
+		}
 
 		bitmap_bh = ext4_read_block_bitmap(sb, group);
 		if (bitmap_bh == NULL) {
-			/* error handling here */
+			ext4_error(sb, __func__, "Error in reading block "
+					"bitmap for %lu\n", group);
 			ext4_mb_release_desc(&e4b);
-			BUG_ON(bitmap_bh == NULL);
+			continue;
 		}
 
 		ext4_lock_group(sb, group);
@@ -4112,22 +4134,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 
 }
 
+static noinline_for_stack void
+ext4_mb_discard_lg_preallocations(struct super_block *sb,
+					struct ext4_locality_group *lg,
+					int order, int total_entries)
+{
+	ext4_group_t group = 0;
+	struct ext4_buddy e4b;
+	struct list_head discard_list;
+	struct ext4_prealloc_space *pa, *tmp;
+	struct ext4_allocation_context *ac;
+
+	mb_debug("discard locality group preallocation\n");
+
+	INIT_LIST_HEAD(&discard_list);
+	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+
+	spin_lock(&lg->lg_prealloc_lock);
+	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
+						pa_inode_list) {
+		spin_lock(&pa->pa_lock);
+		if (atomic_read(&pa->pa_count)) {
+			/*
+			 * This is the pa that we just used
+			 * for block allocation. So don't
+			 * free that
+			 */
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		if (pa->pa_deleted) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		/* only lg prealloc space */
+		BUG_ON(!pa->pa_linear);
+
+		/* seems this one can be freed ... */
+		pa->pa_deleted = 1;
+		spin_unlock(&pa->pa_lock);
+
+		list_del_rcu(&pa->pa_inode_list);
+		list_add(&pa->u.pa_tmp_list, &discard_list);
+
+		total_entries--;
+		if (total_entries <= 5) {
+			/*
+			 * we want to keep only 5 entries
+			 * allowing it to grow to 8. This
+			 * mak sure we don't call discard
+			 * soon for this list.
+			 */
+			break;
+		}
+	}
+	spin_unlock(&lg->lg_prealloc_lock);
+
+	list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
+
+		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+		if (ext4_mb_load_buddy(sb, group, &e4b)) {
+			ext4_error(sb, __func__, "Error in loading buddy "
+					"information for %lu\n", group);
+			continue;
+		}
+		ext4_lock_group(sb, group);
+		list_del(&pa->pa_group_list);
+		ext4_mb_release_group_pa(&e4b, pa, ac);
+		ext4_unlock_group(sb, group);
+
+		ext4_mb_release_desc(&e4b);
+		list_del(&pa->u.pa_tmp_list);
+		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+	}
+	if (ac)
+		kmem_cache_free(ext4_ac_cachep, ac);
+}
+
+/*
+ * We have incremented pa_count. So it cannot be freed at this
+ * point. Also we hold lg_mutex. So no parallel allocation is
+ * possible from this lg. That means pa_free cannot be updated.
+ *
+ * A parallel ext4_mb_discard_group_preallocations is possible.
+ * which can cause the lg_prealloc_list to be updated.
+ */
+
+static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
+{
+	int order, added = 0, lg_prealloc_count = 1;
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_locality_group *lg = ac->ac_lg;
+	struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
+
+	order = fls(pa->pa_free) - 1;
+	if (order > PREALLOC_TB_SIZE - 1)
+		/* The max size of hash table is PREALLOC_TB_SIZE */
+		order = PREALLOC_TB_SIZE - 1;
+	/* Add the prealloc space to lg */
+	rcu_read_lock();
+	list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
+						pa_inode_list) {
+		spin_lock(&tmp_pa->pa_lock);
+		if (tmp_pa->pa_deleted) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		if (!added && pa->pa_free < tmp_pa->pa_free) {
+			/* Add to the tail of the previous entry */
+			list_add_tail_rcu(&pa->pa_inode_list,
+						&tmp_pa->pa_inode_list);
+			added = 1;
+			/*
+			 * we want to count the total
+			 * number of entries in the list
+			 */
+		}
+		spin_unlock(&tmp_pa->pa_lock);
+		lg_prealloc_count++;
+	}
+	if (!added)
+		list_add_tail_rcu(&pa->pa_inode_list,
+					&lg->lg_prealloc_list[order]);
+	rcu_read_unlock();
+
+	/* Now trim the list to be not more than 8 elements */
+	if (lg_prealloc_count > 8) {
+		ext4_mb_discard_lg_preallocations(sb, lg,
+						order, lg_prealloc_count);
+		return;
+	}
+	return ;
+}
+
 /*
  * release all resource we used in allocation
  */
 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 {
-	if (ac->ac_pa) {
-		if (ac->ac_pa->pa_linear) {
+	struct ext4_prealloc_space *pa = ac->ac_pa;
+	if (pa) {
+		if (pa->pa_linear) {
 			/* see comment in ext4_mb_use_group_pa() */
-			spin_lock(&ac->ac_pa->pa_lock);
-			ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
-			ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
-			ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
-			ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
-			spin_unlock(&ac->ac_pa->pa_lock);
+			spin_lock(&pa->pa_lock);
+			pa->pa_pstart += ac->ac_b_ex.fe_len;
+			pa->pa_lstart += ac->ac_b_ex.fe_len;
+			pa->pa_free -= ac->ac_b_ex.fe_len;
+			pa->pa_len -= ac->ac_b_ex.fe_len;
+			spin_unlock(&pa->pa_lock);
+			/*
+			 * We want to add the pa to the right bucket.
+			 * Remove it from the list and while adding
+			 * make sure the list to which we are adding
+			 * doesn't grow big.
+			 */
+			if (likely(pa->pa_free)) {
+				spin_lock(pa->pa_obj_lock);
+				list_del_rcu(&pa->pa_inode_list);
+				spin_unlock(pa->pa_obj_lock);
+				ext4_mb_add_n_trim(ac);
+			}
 		}
-		ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
+		ext4_mb_put_pa(ac, ac->ac_sb, pa);
 	}
 	if (ac->ac_bitmap_page)
 		page_cache_release(ac->ac_bitmap_page);
@@ -4420,11 +4588,15 @@ do_more:
 		count -= overflow;
 	}
 	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
-	if (!bitmap_bh)
+	if (!bitmap_bh) {
+		err = -EIO;
 		goto error_return;
+	}
 	gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
-	if (!gdp)
+	if (!gdp) {
+		err = -EIO;
 		goto error_return;
+	}
 
 	if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
 	    in_range(ext4_inode_bitmap(sb, gdp), block, count) ||