aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJose R. Santos <jrs@us.ibm.com>2008-07-11 19:27:31 -0400
committerTheodore Ts'o <tytso@mit.edu>2008-07-11 19:27:31 -0400
commit772cb7c83ba256a11c7bf99a11bef3858d23767c (patch)
treea42b97e5cbd870a76b2646c2dcb658a92c53f637
parent736603ab297506f4396cb5af592004499950fcfd (diff)
ext4: New inode allocation for FLEX_BG meta-data groups.
This patch mostly controls the way inodes are allocated in order to make ialloc aware of flex_bg block group grouping. It achieves this by bypassing the Orlov allocator when block group meta-data are packed together through mke2fs. Since the impact on the block allocator is minimal, this patch should have little or no effect on other block allocation algorithms. By controlling the inode allocation, it can basically control where the initial search for new blocks begins and thus indirectly manipulate the block allocator. This allocator favors data and meta-data locality so the disk will gradually be filled from block group zero upward. This helps improve performance by reducing seek time. Since the group of inode tables within one flex_bg is treated as one giant inode table, uninitialized block groups would not need to partially initialize as many inode tables as with Orlov, which would help fsck time as the filesystem usage goes up. Signed-off-by: Jose R. Santos <jrs@us.ibm.com> Signed-off-by: Valerie Clement <valerie.clement@bull.net> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r--fs/ext4/balloc.c14
-rw-r--r--fs/ext4/ext4.h25
-rw-r--r--fs/ext4/ext4_sb.h3
-rw-r--r--fs/ext4/ialloc.c96
-rw-r--r--fs/ext4/mballoc.c15
-rw-r--r--fs/ext4/super.c57
6 files changed, 209 insertions, 1 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index ba411233cc25..0b2b7549ac63 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -809,6 +809,13 @@ do_more:
809 spin_unlock(sb_bgl_lock(sbi, block_group)); 809 spin_unlock(sb_bgl_lock(sbi, block_group));
810 percpu_counter_add(&sbi->s_freeblocks_counter, count); 810 percpu_counter_add(&sbi->s_freeblocks_counter, count);
811 811
812 if (sbi->s_log_groups_per_flex) {
813 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
814 spin_lock(sb_bgl_lock(sbi, flex_group));
815 sbi->s_flex_groups[flex_group].free_blocks += count;
816 spin_unlock(sb_bgl_lock(sbi, flex_group));
817 }
818
812 /* We dirtied the bitmap block */ 819 /* We dirtied the bitmap block */
813 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 820 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
814 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 821 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1883,6 +1890,13 @@ allocated:
1883 spin_unlock(sb_bgl_lock(sbi, group_no)); 1890 spin_unlock(sb_bgl_lock(sbi, group_no));
1884 percpu_counter_sub(&sbi->s_freeblocks_counter, num); 1891 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1885 1892
1893 if (sbi->s_log_groups_per_flex) {
1894 ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
1895 spin_lock(sb_bgl_lock(sbi, flex_group));
1896 sbi->s_flex_groups[flex_group].free_blocks -= num;
1897 spin_unlock(sb_bgl_lock(sbi, flex_group));
1898 }
1899
1886 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); 1900 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1887 err = ext4_journal_dirty_metadata(handle, gdp_bh); 1901 err = ext4_journal_dirty_metadata(handle, gdp_bh);
1888 if (!fatal) 1902 if (!fatal)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 109c7d4c19ad..0bfeae18f1a2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -170,6 +170,15 @@ struct ext4_group_desc
170 __u32 bg_reserved2[3]; 170 __u32 bg_reserved2[3];
171}; 171};
172 172
173/*
174 * Structure of a flex block group info
175 */
176
177struct flex_groups {
178 __u32 free_inodes;
179 __u32 free_blocks;
180};
181
173#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ 182#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
174#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ 183#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
175#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ 184#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
@@ -647,7 +656,10 @@ struct ext4_super_block {
647 __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ 656 __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
648 __le64 s_mmp_block; /* Block for multi-mount protection */ 657 __le64 s_mmp_block; /* Block for multi-mount protection */
649 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ 658 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
650 __u32 s_reserved[163]; /* Padding to the end of the block */ 659 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
660 __u8 s_reserved_char_pad2;
661 __le16 s_reserved_pad;
662 __u32 s_reserved[162]; /* Padding to the end of the block */
651}; 663};
652 664
653#ifdef __KERNEL__ 665#ifdef __KERNEL__
@@ -1160,6 +1172,17 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
1160} 1172}
1161 1173
1162 1174
1175static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
1176 ext4_group_t block_group)
1177{
1178 return block_group >> sbi->s_log_groups_per_flex;
1179}
1180
1181static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
1182{
1183 return 1 << sbi->s_log_groups_per_flex;
1184}
1185
1163#define ext4_std_error(sb, errno) \ 1186#define ext4_std_error(sb, errno) \
1164do { \ 1187do { \
1165 if ((errno)) \ 1188 if ((errno)) \
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 4de9a75ca6af..6300226d5531 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -143,6 +143,9 @@ struct ext4_sb_info {
143 143
144 /* locality groups */ 144 /* locality groups */
145 struct ext4_locality_group *s_locality_groups; 145 struct ext4_locality_group *s_locality_groups;
146
147 unsigned int s_log_groups_per_flex;
148 struct flex_groups *s_flex_groups;
146}; 149};
147 150
148#endif /* _EXT4_SB */ 151#endif /* _EXT4_SB */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b30cc79b9fcb..8b0a10acd708 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
157 struct ext4_super_block * es; 157 struct ext4_super_block * es;
158 struct ext4_sb_info *sbi; 158 struct ext4_sb_info *sbi;
159 int fatal = 0, err; 159 int fatal = 0, err;
160 ext4_group_t flex_group;
160 161
161 if (atomic_read(&inode->i_count) > 1) { 162 if (atomic_read(&inode->i_count) > 1) {
162 printk ("ext4_free_inode: inode has count=%d\n", 163 printk ("ext4_free_inode: inode has count=%d\n",
@@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
232 if (is_directory) 233 if (is_directory)
233 percpu_counter_dec(&sbi->s_dirs_counter); 234 percpu_counter_dec(&sbi->s_dirs_counter);
234 235
236 if (sbi->s_log_groups_per_flex) {
237 flex_group = ext4_flex_group(sbi, block_group);
238 spin_lock(sb_bgl_lock(sbi, flex_group));
239 sbi->s_flex_groups[flex_group].free_inodes++;
240 spin_unlock(sb_bgl_lock(sbi, flex_group));
241 }
235 } 242 }
236 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); 243 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
237 err = ext4_journal_dirty_metadata(handle, bh2); 244 err = ext4_journal_dirty_metadata(handle, bh2);
@@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
286 return ret; 293 return ret;
287} 294}
288 295
296#define free_block_ratio 10
297
298static int find_group_flex(struct super_block *sb, struct inode *parent,
299 ext4_group_t *best_group)
300{
301 struct ext4_sb_info *sbi = EXT4_SB(sb);
302 struct ext4_group_desc *desc;
303 struct buffer_head *bh;
304 struct flex_groups *flex_group = sbi->s_flex_groups;
305 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
306 ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
307 ext4_group_t ngroups = sbi->s_groups_count;
308 int flex_size = ext4_flex_bg_size(sbi);
309 ext4_group_t best_flex = parent_fbg_group;
310 int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
311 int flexbg_free_blocks;
312 int flex_freeb_ratio;
313 ext4_group_t n_fbg_groups;
314 ext4_group_t i;
315
316 n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
317 sbi->s_log_groups_per_flex;
318
319find_close_to_parent:
320 flexbg_free_blocks = flex_group[best_flex].free_blocks;
321 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
322 if (flex_group[best_flex].free_inodes &&
323 flex_freeb_ratio > free_block_ratio)
324 goto found_flexbg;
325
326 if (best_flex && best_flex == parent_fbg_group) {
327 best_flex--;
328 goto find_close_to_parent;
329 }
330
331 for (i = 0; i < n_fbg_groups; i++) {
332 if (i == parent_fbg_group || i == parent_fbg_group - 1)
333 continue;
334
335 flexbg_free_blocks = flex_group[i].free_blocks;
336 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
337
338 if (flex_freeb_ratio > free_block_ratio &&
339 flex_group[i].free_inodes) {
340 best_flex = i;
341 goto found_flexbg;
342 }
343
344 if (best_flex < 0 ||
345 (flex_group[i].free_blocks >
346 flex_group[best_flex].free_blocks &&
347 flex_group[i].free_inodes))
348 best_flex = i;
349 }
350
351 if (!flex_group[best_flex].free_inodes ||
352 !flex_group[best_flex].free_blocks)
353 return -1;
354
355found_flexbg:
356 for (i = best_flex * flex_size; i < ngroups &&
357 i < (best_flex + 1) * flex_size; i++) {
358 desc = ext4_get_group_desc(sb, i, &bh);
359 if (le16_to_cpu(desc->bg_free_inodes_count)) {
360 *best_group = i;
361 goto out;
362 }
363 }
364
365 return -1;
366out:
367 return 0;
368}
369
289/* 370/*
290 * Orlov's allocator for directories. 371 * Orlov's allocator for directories.
291 * 372 *
@@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
501 struct inode *ret; 582 struct inode *ret;
502 ext4_group_t i; 583 ext4_group_t i;
503 int free = 0; 584 int free = 0;
585 ext4_group_t flex_group;
504 586
505 /* Cannot create files in a deleted directory */ 587 /* Cannot create files in a deleted directory */
506 if (!dir || !dir->i_nlink) 588 if (!dir || !dir->i_nlink)
@@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
514 596
515 sbi = EXT4_SB(sb); 597 sbi = EXT4_SB(sb);
516 es = sbi->s_es; 598 es = sbi->s_es;
599
600 if (sbi->s_log_groups_per_flex) {
601 ret2 = find_group_flex(sb, dir, &group);
602 goto got_group;
603 }
604
517 if (S_ISDIR(mode)) { 605 if (S_ISDIR(mode)) {
518 if (test_opt (sb, OLDALLOC)) 606 if (test_opt (sb, OLDALLOC))
519 ret2 = find_group_dir(sb, dir, &group); 607 ret2 = find_group_dir(sb, dir, &group);
@@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
522 } else 610 } else
523 ret2 = find_group_other(sb, dir, &group); 611 ret2 = find_group_other(sb, dir, &group);
524 612
613got_group:
525 err = -ENOSPC; 614 err = -ENOSPC;
526 if (ret2 == -1) 615 if (ret2 == -1)
527 goto out; 616 goto out;
@@ -676,6 +765,13 @@ got:
676 percpu_counter_inc(&sbi->s_dirs_counter); 765 percpu_counter_inc(&sbi->s_dirs_counter);
677 sb->s_dirt = 1; 766 sb->s_dirt = 1;
678 767
768 if (sbi->s_log_groups_per_flex) {
769 flex_group = ext4_flex_group(sbi, group);
770 spin_lock(sb_bgl_lock(sbi, flex_group));
771 sbi->s_flex_groups[flex_group].free_inodes--;
772 spin_unlock(sb_bgl_lock(sbi, flex_group));
773 }
774
679 inode->i_uid = current->fsuid; 775 inode->i_uid = current->fsuid;
680 if (test_opt (sb, GRPID)) 776 if (test_opt (sb, GRPID))
681 inode->i_gid = dir->i_gid; 777 inode->i_gid = dir->i_gid;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b882868f4661..5dcb826401bb 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2842,6 +2842,14 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2842 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 2842 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2843 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 2843 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
2844 2844
2845 if (sbi->s_log_groups_per_flex) {
2846 ext4_group_t flex_group = ext4_flex_group(sbi,
2847 ac->ac_b_ex.fe_group);
2848 spin_lock(sb_bgl_lock(sbi, flex_group));
2849 sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
2850 spin_unlock(sb_bgl_lock(sbi, flex_group));
2851 }
2852
2845 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 2853 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
2846 if (err) 2854 if (err)
2847 goto out_err; 2855 goto out_err;
@@ -4342,6 +4350,13 @@ do_more:
4342 spin_unlock(sb_bgl_lock(sbi, block_group)); 4350 spin_unlock(sb_bgl_lock(sbi, block_group));
4343 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4351 percpu_counter_add(&sbi->s_freeblocks_counter, count);
4344 4352
4353 if (sbi->s_log_groups_per_flex) {
4354 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4355 spin_lock(sb_bgl_lock(sbi, flex_group));
4356 sbi->s_flex_groups[flex_group].free_blocks += count;
4357 spin_unlock(sb_bgl_lock(sbi, flex_group));
4358 }
4359
4345 ext4_mb_release_desc(&e4b); 4360 ext4_mb_release_desc(&e4b);
4346 4361
4347 *freed += count; 4362 *freed += count;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 588cfb408642..b9ad3d852061 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -517,6 +517,7 @@ static void ext4_put_super (struct super_block * sb)
517 for (i = 0; i < sbi->s_gdb_count; i++) 517 for (i = 0; i < sbi->s_gdb_count; i++)
518 brelse(sbi->s_group_desc[i]); 518 brelse(sbi->s_group_desc[i]);
519 kfree(sbi->s_group_desc); 519 kfree(sbi->s_group_desc);
520 kfree(sbi->s_flex_groups);
520 percpu_counter_destroy(&sbi->s_freeblocks_counter); 521 percpu_counter_destroy(&sbi->s_freeblocks_counter);
521 percpu_counter_destroy(&sbi->s_freeinodes_counter); 522 percpu_counter_destroy(&sbi->s_freeinodes_counter);
522 percpu_counter_destroy(&sbi->s_dirs_counter); 523 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -1442,6 +1443,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1442 return res; 1443 return res;
1443} 1444}
1444 1445
1446static int ext4_fill_flex_info(struct super_block *sb)
1447{
1448 struct ext4_sb_info *sbi = EXT4_SB(sb);
1449 struct ext4_group_desc *gdp = NULL;
1450 struct buffer_head *bh;
1451 ext4_group_t flex_group_count;
1452 ext4_group_t flex_group;
1453 int groups_per_flex = 0;
1454 __u64 block_bitmap = 0;
1455 int i;
1456
1457 if (!sbi->s_es->s_log_groups_per_flex) {
1458 sbi->s_log_groups_per_flex = 0;
1459 return 1;
1460 }
1461
1462 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1463 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1464
1465 flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
1466 groups_per_flex;
1467 sbi->s_flex_groups = kmalloc(flex_group_count *
1468 sizeof(struct flex_groups), GFP_KERNEL);
1469 if (sbi->s_flex_groups == NULL) {
1470 printk(KERN_ERR "EXT4-fs: not enough memory\n");
1471 goto failed;
1472 }
1473 memset(sbi->s_flex_groups, 0, flex_group_count *
1474 sizeof(struct flex_groups));
1475
1476 gdp = ext4_get_group_desc(sb, 1, &bh);
1477 block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
1478
1479 for (i = 0; i < sbi->s_groups_count; i++) {
1480 gdp = ext4_get_group_desc(sb, i, &bh);
1481
1482 flex_group = ext4_flex_group(sbi, i);
1483 sbi->s_flex_groups[flex_group].free_inodes +=
1484 le16_to_cpu(gdp->bg_free_inodes_count);
1485 sbi->s_flex_groups[flex_group].free_blocks +=
1486 le16_to_cpu(gdp->bg_free_blocks_count);
1487 }
1488
1489 return 1;
1490failed:
1491 return 0;
1492}
1493
1445__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, 1494__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
1446 struct ext4_group_desc *gdp) 1495 struct ext4_group_desc *gdp)
1447{ 1496{
@@ -2137,6 +2186,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
2137 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); 2186 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
2138 goto failed_mount2; 2187 goto failed_mount2;
2139 } 2188 }
2189 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
2190 if (!ext4_fill_flex_info(sb)) {
2191 printk(KERN_ERR
2192 "EXT4-fs: unable to initialize "
2193 "flex_bg meta info!\n");
2194 goto failed_mount2;
2195 }
2196
2140 sbi->s_gdb_count = db_count; 2197 sbi->s_gdb_count = db_count;
2141 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 2198 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2142 spin_lock_init(&sbi->s_next_gen_lock); 2199 spin_lock_init(&sbi->s_next_gen_lock);