diff options
author | Theodore Ts'o <tytso@mit.edu> | 2009-05-01 08:50:38 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2009-05-01 08:50:38 -0400 |
commit | 8df9675f8b498d0bfa1f0b5b06f56bf1ff366dd5 (patch) | |
tree | 38fd56a82049f50b4d774af47b9d39f116071755 /fs/ext4/ialloc.c | |
parent | 9ca92389c5312a51e819c15c762f0abdc7f3129b (diff) |
ext4: Avoid races caused by on-line resizing and SMP memory reordering
Ext4's on-line resizing adds a new block group and then, only at the
last step adjusts s_groups_count. However, it's possible on SMP
systems that another CPU could see the updated s_groups_count and
not see the newly initialized data structures for the just-added block
group. For this reason, it's important to insert an SMP read barrier
after reading s_groups_count and before reading any data (for example,
the new block group descriptors) allowed by the increased value of
s_groups_count.
Unfortunately, we rather blatantly violate this locking protocol
documented in fs/ext4/resize.c. Fortunately, (1) on-line resizes
happen relatively rarely, and (2) it seems rare that the filesystem
code will immediately try to use the just-added block group before any
memory ordering issues resolve themselves. So apparently problems
here are relatively hard to hit, since ext3 has been vulnerable to the
same issue for years with no one apparently complaining.
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/ialloc.c')
-rw-r--r-- | fs/ext4/ialloc.c | 40 |
1 files changed, 19 insertions, 21 deletions
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f18e0a08a6b5..55ba419ca00b 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
@@ -316,7 +316,7 @@ error_return: | |||
316 | static int find_group_dir(struct super_block *sb, struct inode *parent, | 316 | static int find_group_dir(struct super_block *sb, struct inode *parent, |
317 | ext4_group_t *best_group) | 317 | ext4_group_t *best_group) |
318 | { | 318 | { |
319 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; | 319 | ext4_group_t ngroups = ext4_get_groups_count(sb); |
320 | unsigned int freei, avefreei; | 320 | unsigned int freei, avefreei; |
321 | struct ext4_group_desc *desc, *best_desc = NULL; | 321 | struct ext4_group_desc *desc, *best_desc = NULL; |
322 | ext4_group_t group; | 322 | ext4_group_t group; |
@@ -353,7 +353,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, | |||
353 | struct flex_groups *flex_group = sbi->s_flex_groups; | 353 | struct flex_groups *flex_group = sbi->s_flex_groups; |
354 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; | 354 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; |
355 | ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); | 355 | ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); |
356 | ext4_group_t ngroups = sbi->s_groups_count; | 356 | ext4_group_t ngroups = ext4_get_groups_count(sb); |
357 | int flex_size = ext4_flex_bg_size(sbi); | 357 | int flex_size = ext4_flex_bg_size(sbi); |
358 | ext4_group_t best_flex = parent_fbg_group; | 358 | ext4_group_t best_flex = parent_fbg_group; |
359 | int blocks_per_flex = sbi->s_blocks_per_group * flex_size; | 359 | int blocks_per_flex = sbi->s_blocks_per_group * flex_size; |
@@ -362,7 +362,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, | |||
362 | ext4_group_t n_fbg_groups; | 362 | ext4_group_t n_fbg_groups; |
363 | ext4_group_t i; | 363 | ext4_group_t i; |
364 | 364 | ||
365 | n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> | 365 | n_fbg_groups = (ngroups + flex_size - 1) >> |
366 | sbi->s_log_groups_per_flex; | 366 | sbi->s_log_groups_per_flex; |
367 | 367 | ||
368 | find_close_to_parent: | 368 | find_close_to_parent: |
@@ -478,20 +478,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, | |||
478 | { | 478 | { |
479 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; | 479 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; |
480 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 480 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
481 | ext4_group_t ngroups = sbi->s_groups_count; | 481 | ext4_group_t real_ngroups = ext4_get_groups_count(sb); |
482 | int inodes_per_group = EXT4_INODES_PER_GROUP(sb); | 482 | int inodes_per_group = EXT4_INODES_PER_GROUP(sb); |
483 | unsigned int freei, avefreei; | 483 | unsigned int freei, avefreei; |
484 | ext4_fsblk_t freeb, avefreeb; | 484 | ext4_fsblk_t freeb, avefreeb; |
485 | unsigned int ndirs; | 485 | unsigned int ndirs; |
486 | int max_dirs, min_inodes; | 486 | int max_dirs, min_inodes; |
487 | ext4_grpblk_t min_blocks; | 487 | ext4_grpblk_t min_blocks; |
488 | ext4_group_t i, grp, g; | 488 | ext4_group_t i, grp, g, ngroups; |
489 | struct ext4_group_desc *desc; | 489 | struct ext4_group_desc *desc; |
490 | struct orlov_stats stats; | 490 | struct orlov_stats stats; |
491 | int flex_size = ext4_flex_bg_size(sbi); | 491 | int flex_size = ext4_flex_bg_size(sbi); |
492 | 492 | ||
493 | ngroups = real_ngroups; | ||
493 | if (flex_size > 1) { | 494 | if (flex_size > 1) { |
494 | ngroups = (ngroups + flex_size - 1) >> | 495 | ngroups = (real_ngroups + flex_size - 1) >> |
495 | sbi->s_log_groups_per_flex; | 496 | sbi->s_log_groups_per_flex; |
496 | parent_group >>= sbi->s_log_groups_per_flex; | 497 | parent_group >>= sbi->s_log_groups_per_flex; |
497 | } | 498 | } |
@@ -543,7 +544,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, | |||
543 | */ | 544 | */ |
544 | grp *= flex_size; | 545 | grp *= flex_size; |
545 | for (i = 0; i < flex_size; i++) { | 546 | for (i = 0; i < flex_size; i++) { |
546 | if (grp+i >= sbi->s_groups_count) | 547 | if (grp+i >= real_ngroups) |
547 | break; | 548 | break; |
548 | desc = ext4_get_group_desc(sb, grp+i, NULL); | 549 | desc = ext4_get_group_desc(sb, grp+i, NULL); |
549 | if (desc && ext4_free_inodes_count(sb, desc)) { | 550 | if (desc && ext4_free_inodes_count(sb, desc)) { |
@@ -583,7 +584,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, | |||
583 | } | 584 | } |
584 | 585 | ||
585 | fallback: | 586 | fallback: |
586 | ngroups = sbi->s_groups_count; | 587 | ngroups = real_ngroups; |
587 | avefreei = freei / ngroups; | 588 | avefreei = freei / ngroups; |
588 | fallback_retry: | 589 | fallback_retry: |
589 | parent_group = EXT4_I(parent)->i_block_group; | 590 | parent_group = EXT4_I(parent)->i_block_group; |
@@ -613,9 +614,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, | |||
613 | ext4_group_t *group, int mode) | 614 | ext4_group_t *group, int mode) |
614 | { | 615 | { |
615 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; | 616 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; |
616 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; | 617 | ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); |
617 | struct ext4_group_desc *desc; | 618 | struct ext4_group_desc *desc; |
618 | ext4_group_t i, last; | ||
619 | int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); | 619 | int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); |
620 | 620 | ||
621 | /* | 621 | /* |
@@ -799,11 +799,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) | |||
799 | struct super_block *sb; | 799 | struct super_block *sb; |
800 | struct buffer_head *inode_bitmap_bh = NULL; | 800 | struct buffer_head *inode_bitmap_bh = NULL; |
801 | struct buffer_head *group_desc_bh; | 801 | struct buffer_head *group_desc_bh; |
802 | ext4_group_t group = 0; | 802 | ext4_group_t ngroups, group = 0; |
803 | unsigned long ino = 0; | 803 | unsigned long ino = 0; |
804 | struct inode *inode; | 804 | struct inode *inode; |
805 | struct ext4_group_desc *gdp = NULL; | 805 | struct ext4_group_desc *gdp = NULL; |
806 | struct ext4_super_block *es; | ||
807 | struct ext4_inode_info *ei; | 806 | struct ext4_inode_info *ei; |
808 | struct ext4_sb_info *sbi; | 807 | struct ext4_sb_info *sbi; |
809 | int ret2, err = 0; | 808 | int ret2, err = 0; |
@@ -818,15 +817,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) | |||
818 | return ERR_PTR(-EPERM); | 817 | return ERR_PTR(-EPERM); |
819 | 818 | ||
820 | sb = dir->i_sb; | 819 | sb = dir->i_sb; |
820 | ngroups = ext4_get_groups_count(sb); | ||
821 | trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, | 821 | trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, |
822 | dir->i_ino, mode); | 822 | dir->i_ino, mode); |
823 | inode = new_inode(sb); | 823 | inode = new_inode(sb); |
824 | if (!inode) | 824 | if (!inode) |
825 | return ERR_PTR(-ENOMEM); | 825 | return ERR_PTR(-ENOMEM); |
826 | ei = EXT4_I(inode); | 826 | ei = EXT4_I(inode); |
827 | |||
828 | sbi = EXT4_SB(sb); | 827 | sbi = EXT4_SB(sb); |
829 | es = sbi->s_es; | ||
830 | 828 | ||
831 | if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { | 829 | if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { |
832 | ret2 = find_group_flex(sb, dir, &group); | 830 | ret2 = find_group_flex(sb, dir, &group); |
@@ -856,7 +854,7 @@ got_group: | |||
856 | if (ret2 == -1) | 854 | if (ret2 == -1) |
857 | goto out; | 855 | goto out; |
858 | 856 | ||
859 | for (i = 0; i < sbi->s_groups_count; i++) { | 857 | for (i = 0; i < ngroups; i++) { |
860 | err = -EIO; | 858 | err = -EIO; |
861 | 859 | ||
862 | gdp = ext4_get_group_desc(sb, group, &group_desc_bh); | 860 | gdp = ext4_get_group_desc(sb, group, &group_desc_bh); |
@@ -917,7 +915,7 @@ repeat_in_this_group: | |||
917 | * group descriptor metadata has not yet been updated. | 915 | * group descriptor metadata has not yet been updated. |
918 | * So we just go onto the next blockgroup. | 916 | * So we just go onto the next blockgroup. |
919 | */ | 917 | */ |
920 | if (++group == sbi->s_groups_count) | 918 | if (++group == ngroups) |
921 | group = 0; | 919 | group = 0; |
922 | } | 920 | } |
923 | err = -ENOSPC; | 921 | err = -ENOSPC; |
@@ -1158,7 +1156,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) | |||
1158 | { | 1156 | { |
1159 | unsigned long desc_count; | 1157 | unsigned long desc_count; |
1160 | struct ext4_group_desc *gdp; | 1158 | struct ext4_group_desc *gdp; |
1161 | ext4_group_t i; | 1159 | ext4_group_t i, ngroups = ext4_get_groups_count(sb); |
1162 | #ifdef EXT4FS_DEBUG | 1160 | #ifdef EXT4FS_DEBUG |
1163 | struct ext4_super_block *es; | 1161 | struct ext4_super_block *es; |
1164 | unsigned long bitmap_count, x; | 1162 | unsigned long bitmap_count, x; |
@@ -1168,7 +1166,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) | |||
1168 | desc_count = 0; | 1166 | desc_count = 0; |
1169 | bitmap_count = 0; | 1167 | bitmap_count = 0; |
1170 | gdp = NULL; | 1168 | gdp = NULL; |
1171 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { | 1169 | for (i = 0; i < ngroups; i++) { |
1172 | gdp = ext4_get_group_desc(sb, i, NULL); | 1170 | gdp = ext4_get_group_desc(sb, i, NULL); |
1173 | if (!gdp) | 1171 | if (!gdp) |
1174 | continue; | 1172 | continue; |
@@ -1190,7 +1188,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) | |||
1190 | return desc_count; | 1188 | return desc_count; |
1191 | #else | 1189 | #else |
1192 | desc_count = 0; | 1190 | desc_count = 0; |
1193 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { | 1191 | for (i = 0; i < ngroups; i++) { |
1194 | gdp = ext4_get_group_desc(sb, i, NULL); | 1192 | gdp = ext4_get_group_desc(sb, i, NULL); |
1195 | if (!gdp) | 1193 | if (!gdp) |
1196 | continue; | 1194 | continue; |
@@ -1205,9 +1203,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) | |||
1205 | unsigned long ext4_count_dirs(struct super_block * sb) | 1203 | unsigned long ext4_count_dirs(struct super_block * sb) |
1206 | { | 1204 | { |
1207 | unsigned long count = 0; | 1205 | unsigned long count = 0; |
1208 | ext4_group_t i; | 1206 | ext4_group_t i, ngroups = ext4_get_groups_count(sb); |
1209 | 1207 | ||
1210 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { | 1208 | for (i = 0; i < ngroups; i++) { |
1211 | struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); | 1209 | struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); |
1212 | if (!gdp) | 1210 | if (!gdp) |
1213 | continue; | 1211 | continue; |