aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/ialloc.c
diff options
context:
space:
mode:
authorTheodore Ts'o <tytso@mit.edu>2009-03-12 12:18:34 -0400
committerTheodore Ts'o <tytso@mit.edu>2009-03-12 12:18:34 -0400
commita4912123b688e057084e6557cef8924f7ae5bbde (patch)
tree34e88705d6617b52caa0f87692b480119a9c9e2e /fs/ext4/ialloc.c
parent2dc6b0d48ca0599837df21b14bb8393d0804af57 (diff)
ext4: New inode/block allocation algorithms for flex_bg filesystems
The find_group_flex() inode allocator is now only used if the filesystem is mounted using the "oldalloc" mount option. It is replaced with the original Orlov allocator that has been updated for flex_bg filesystems (it should behave the same way if flex_bg is disabled). The inode allocator now functions by taking into account each flex_bg group, instead of each block group, when deciding whether or not it's time to allocate a new directory into a fresh flex_bg. The block allocator has also been changed so that the first block group in each flex_bg is preferred for use for storing directory blocks. This keeps directory blocks close together, which is good for speeding up e2fsck since large directories are more likely to look like this: debugfs: stat /home/tytso/Maildir/cur Inode: 1844562 Type: directory Mode: 0700 Flags: 0x81000 Generation: 1132745781 Version: 0x00000000:0000ad71 User: 15806 Group: 15806 Size: 1060864 File ACL: 0 Directory ACL: 0 Links: 2 Blockcount: 2072 Fragment: Address: 0 Number: 0 Size: 0 ctime: 0x499c0ff4:164961f4 -- Wed Feb 18 08:41:08 2009 atime: 0x499c0ff4:00000000 -- Wed Feb 18 08:41:08 2009 mtime: 0x49957f51:00000000 -- Fri Feb 13 09:10:25 2009 crtime: 0x499c0f57:00d51440 -- Wed Feb 18 08:38:31 2009 Size of extra inode fields: 28 BLOCKS: (0):7348651, (1-258):7348654-7348911 TOTAL: 259 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/ialloc.c')
-rw-r--r--fs/ext4/ialloc.c215
1 files changed, 159 insertions, 56 deletions
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index ae3eb57dccdd..617f5a2d800a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -410,6 +410,43 @@ out:
410 return 0; 410 return 0;
411} 411}
412 412
413struct orlov_stats {
414 __u32 free_inodes;
415 __u32 free_blocks;
416 __u32 used_dirs;
417};
418
419/*
420 * Helper function for Orlov's allocator; returns critical information
421 * for a particular block group or flex_bg. If flex_size is 1, then g
422 * is a block group number; otherwise it is flex_bg number.
423 */
424void get_orlov_stats(struct super_block *sb, ext4_group_t g,
425 int flex_size, struct orlov_stats *stats)
426{
427 struct ext4_group_desc *desc;
428 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
429 int i;
430
431 stats->free_inodes = 0;
432 stats->free_blocks = 0;
433 stats->used_dirs = 0;
434
435 g *= flex_size;
436
437 for (i = 0; i < flex_size; i++) {
438 if (g >= ngroups)
439 break;
440 desc = ext4_get_group_desc(sb, g++, NULL);
441 if (!desc)
442 continue;
443
444 stats->free_inodes += ext4_free_inodes_count(sb, desc);
445 stats->free_blocks += ext4_free_blks_count(sb, desc);
446 stats->used_dirs += ext4_used_dirs_count(sb, desc);
447 }
448}
449
413/* 450/*
414 * Orlov's allocator for directories. 451 * Orlov's allocator for directories.
415 * 452 *
@@ -425,35 +462,34 @@ out:
425 * it has too many directories already (max_dirs) or 462 * it has too many directories already (max_dirs) or
426 * it has too few free inodes left (min_inodes) or 463 * it has too few free inodes left (min_inodes) or
427 * it has too few free blocks left (min_blocks) or 464 * it has too few free blocks left (min_blocks) or
428 * it's already running too large debt (max_debt).
429 * Parent's group is preferred, if it doesn't satisfy these 465 * Parent's group is preferred, if it doesn't satisfy these
430 * conditions we search cyclically through the rest. If none 466 * conditions we search cyclically through the rest. If none
431 * of the groups look good we just look for a group with more 467 * of the groups look good we just look for a group with more
432 * free inodes than average (starting at parent's group). 468 * free inodes than average (starting at parent's group).
433 *
434 * Debt is incremented each time we allocate a directory and decremented
435 * when we allocate an inode, within 0--255.
436 */ 469 */
437 470
438#define INODE_COST 64
439#define BLOCK_COST 256
440
441static int find_group_orlov(struct super_block *sb, struct inode *parent, 471static int find_group_orlov(struct super_block *sb, struct inode *parent,
442 ext4_group_t *group) 472 ext4_group_t *group, int mode)
443{ 473{
444 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 474 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
445 struct ext4_sb_info *sbi = EXT4_SB(sb); 475 struct ext4_sb_info *sbi = EXT4_SB(sb);
446 struct ext4_super_block *es = sbi->s_es;
447 ext4_group_t ngroups = sbi->s_groups_count; 476 ext4_group_t ngroups = sbi->s_groups_count;
448 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 477 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
449 unsigned int freei, avefreei; 478 unsigned int freei, avefreei;
450 ext4_fsblk_t freeb, avefreeb; 479 ext4_fsblk_t freeb, avefreeb;
451 ext4_fsblk_t blocks_per_dir;
452 unsigned int ndirs; 480 unsigned int ndirs;
453 int max_debt, max_dirs, min_inodes; 481 int max_dirs, min_inodes;
454 ext4_grpblk_t min_blocks; 482 ext4_grpblk_t min_blocks;
455 ext4_group_t i; 483 ext4_group_t i, grp, g;
456 struct ext4_group_desc *desc; 484 struct ext4_group_desc *desc;
485 struct orlov_stats stats;
486 int flex_size = ext4_flex_bg_size(sbi);
487
488 if (flex_size > 1) {
489 ngroups = (ngroups + flex_size - 1) >>
490 sbi->s_log_groups_per_flex;
491 parent_group >>= sbi->s_log_groups_per_flex;
492 }
457 493
458 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); 494 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
459 avefreei = freei / ngroups; 495 avefreei = freei / ngroups;
@@ -462,71 +498,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
462 do_div(avefreeb, ngroups); 498 do_div(avefreeb, ngroups);
463 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); 499 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
464 500
465 if ((parent == sb->s_root->d_inode) || 501 if (S_ISDIR(mode) &&
466 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { 502 ((parent == sb->s_root->d_inode) ||
503 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
467 int best_ndir = inodes_per_group; 504 int best_ndir = inodes_per_group;
468 ext4_group_t grp;
469 int ret = -1; 505 int ret = -1;
470 506
471 get_random_bytes(&grp, sizeof(grp)); 507 get_random_bytes(&grp, sizeof(grp));
472 parent_group = (unsigned)grp % ngroups; 508 parent_group = (unsigned)grp % ngroups;
473 for (i = 0; i < ngroups; i++) { 509 for (i = 0; i < ngroups; i++) {
474 grp = (parent_group + i) % ngroups; 510 g = (parent_group + i) % ngroups;
475 desc = ext4_get_group_desc(sb, grp, NULL); 511 get_orlov_stats(sb, g, flex_size, &stats);
476 if (!desc || !ext4_free_inodes_count(sb, desc)) 512 if (!stats.free_inodes)
477 continue; 513 continue;
478 if (ext4_used_dirs_count(sb, desc) >= best_ndir) 514 if (stats.used_dirs >= best_ndir)
479 continue; 515 continue;
480 if (ext4_free_inodes_count(sb, desc) < avefreei) 516 if (stats.free_inodes < avefreei)
481 continue; 517 continue;
482 if (ext4_free_blks_count(sb, desc) < avefreeb) 518 if (stats.free_blocks < avefreeb)
483 continue; 519 continue;
484 *group = grp; 520 grp = g;
485 ret = 0; 521 ret = 0;
486 best_ndir = ext4_used_dirs_count(sb, desc); 522 best_ndir = stats.used_dirs;
523 }
524 if (ret)
525 goto fallback;
526 found_flex_bg:
527 if (flex_size == 1) {
528 *group = grp;
529 return 0;
530 }
531
532 /*
533 * We pack inodes at the beginning of the flexgroup's
534 * inode tables. Block allocation decisions will do
535 * something similar, although regular files will
536 * start at 2nd block group of the flexgroup. See
537 * ext4_ext_find_goal() and ext4_find_near().
538 */
539 grp *= flex_size;
540 for (i = 0; i < flex_size; i++) {
541 if (grp+i >= sbi->s_groups_count)
542 break;
543 desc = ext4_get_group_desc(sb, grp+i, NULL);
544 if (desc && ext4_free_inodes_count(sb, desc)) {
545 *group = grp+i;
546 return 0;
547 }
487 } 548 }
488 if (ret == 0)
489 return ret;
490 goto fallback; 549 goto fallback;
491 } 550 }
492 551
493 blocks_per_dir = ext4_blocks_count(es) - freeb;
494 do_div(blocks_per_dir, ndirs);
495
496 max_dirs = ndirs / ngroups + inodes_per_group / 16; 552 max_dirs = ndirs / ngroups + inodes_per_group / 16;
497 min_inodes = avefreei - inodes_per_group / 4; 553 min_inodes = avefreei - inodes_per_group*flex_size / 4;
498 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; 554 if (min_inodes < 1)
499 555 min_inodes = 1;
500 max_debt = EXT4_BLOCKS_PER_GROUP(sb); 556 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
501 max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); 557
502 if (max_debt * INODE_COST > inodes_per_group) 558 /*
503 max_debt = inodes_per_group / INODE_COST; 559 * Start looking in the flex group where we last allocated an
504 if (max_debt > 255) 560 * inode for this parent directory
505 max_debt = 255; 561 */
506 if (max_debt == 0) 562 if (EXT4_I(parent)->i_last_alloc_group != ~0) {
507 max_debt = 1; 563 parent_group = EXT4_I(parent)->i_last_alloc_group;
564 if (flex_size > 1)
565 parent_group >>= sbi->s_log_groups_per_flex;
566 }
508 567
509 for (i = 0; i < ngroups; i++) { 568 for (i = 0; i < ngroups; i++) {
510 *group = (parent_group + i) % ngroups; 569 grp = (parent_group + i) % ngroups;
511 desc = ext4_get_group_desc(sb, *group, NULL); 570 get_orlov_stats(sb, grp, flex_size, &stats);
512 if (!desc || !ext4_free_inodes_count(sb, desc)) 571 if (stats.used_dirs >= max_dirs)
513 continue;
514 if (ext4_used_dirs_count(sb, desc) >= max_dirs)
515 continue; 572 continue;
516 if (ext4_free_inodes_count(sb, desc) < min_inodes) 573 if (stats.free_inodes < min_inodes)
517 continue; 574 continue;
518 if (ext4_free_blks_count(sb, desc) < min_blocks) 575 if (stats.free_blocks < min_blocks)
519 continue; 576 continue;
520 return 0; 577 goto found_flex_bg;
521 } 578 }
522 579
523fallback: 580fallback:
581 ngroups = sbi->s_groups_count;
582 avefreei = freei / ngroups;
583 parent_group = EXT4_I(parent)->i_block_group;
524 for (i = 0; i < ngroups; i++) { 584 for (i = 0; i < ngroups; i++) {
525 *group = (parent_group + i) % ngroups; 585 grp = (parent_group + i) % ngroups;
526 desc = ext4_get_group_desc(sb, *group, NULL); 586 desc = ext4_get_group_desc(sb, grp, NULL);
527 if (desc && ext4_free_inodes_count(sb, desc) && 587 if (desc && ext4_free_inodes_count(sb, desc) &&
528 ext4_free_inodes_count(sb, desc) >= avefreei) 588 ext4_free_inodes_count(sb, desc) >= avefreei) {
589 *group = grp;
529 return 0; 590 return 0;
591 }
530 } 592 }
531 593
532 if (avefreei) { 594 if (avefreei) {
@@ -542,12 +604,51 @@ fallback:
542} 604}
543 605
544static int find_group_other(struct super_block *sb, struct inode *parent, 606static int find_group_other(struct super_block *sb, struct inode *parent,
545 ext4_group_t *group) 607 ext4_group_t *group, int mode)
546{ 608{
547 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 609 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
548 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 610 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
549 struct ext4_group_desc *desc; 611 struct ext4_group_desc *desc;
550 ext4_group_t i; 612 ext4_group_t i, last;
613 int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
614
615 /*
616 * Try to place the inode is the same flex group as its
617 * parent. If we can't find space, use the Orlov algorithm to
618 * find another flex group, and store that information in the
619 * parent directory's inode information so that use that flex
620 * group for future allocations.
621 */
622 if (flex_size > 1) {
623 int retry = 0;
624
625 try_again:
626 parent_group &= ~(flex_size-1);
627 last = parent_group + flex_size;
628 if (last > ngroups)
629 last = ngroups;
630 for (i = parent_group; i < last; i++) {
631 desc = ext4_get_group_desc(sb, i, NULL);
632 if (desc && ext4_free_inodes_count(sb, desc)) {
633 *group = i;
634 return 0;
635 }
636 }
637 if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
638 retry = 1;
639 parent_group = EXT4_I(parent)->i_last_alloc_group;
640 goto try_again;
641 }
642 /*
643 * If this didn't work, use the Orlov search algorithm
644 * to find a new flex group; we pass in the mode to
645 * avoid the topdir algorithms.
646 */
647 *group = parent_group + flex_size;
648 if (*group > ngroups)
649 *group = 0;
650 return find_group_orlov(sb, parent, group, mode);
651 }
551 652
552 /* 653 /*
553 * Try to place the inode in its parent directory 654 * Try to place the inode in its parent directory
@@ -716,10 +817,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
716 sbi = EXT4_SB(sb); 817 sbi = EXT4_SB(sb);
717 es = sbi->s_es; 818 es = sbi->s_es;
718 819
719 if (sbi->s_log_groups_per_flex) { 820 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
720 ret2 = find_group_flex(sb, dir, &group); 821 ret2 = find_group_flex(sb, dir, &group);
721 if (ret2 == -1) { 822 if (ret2 == -1) {
722 ret2 = find_group_other(sb, dir, &group); 823 ret2 = find_group_other(sb, dir, &group, mode);
723 if (ret2 == 0 && once) 824 if (ret2 == 0 && once)
724 once = 0; 825 once = 0;
725 printk(KERN_NOTICE "ext4: find_group_flex " 826 printk(KERN_NOTICE "ext4: find_group_flex "
@@ -733,11 +834,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
733 if (test_opt(sb, OLDALLOC)) 834 if (test_opt(sb, OLDALLOC))
734 ret2 = find_group_dir(sb, dir, &group); 835 ret2 = find_group_dir(sb, dir, &group);
735 else 836 else
736 ret2 = find_group_orlov(sb, dir, &group); 837 ret2 = find_group_orlov(sb, dir, &group, mode);
737 } else 838 } else
738 ret2 = find_group_other(sb, dir, &group); 839 ret2 = find_group_other(sb, dir, &group, mode);
739 840
740got_group: 841got_group:
842 EXT4_I(dir)->i_last_alloc_group = group;
741 err = -ENOSPC; 843 err = -ENOSPC;
742 if (ret2 == -1) 844 if (ret2 == -1)
743 goto out; 845 goto out;
@@ -894,6 +996,7 @@ got:
894 ei->i_file_acl = 0; 996 ei->i_file_acl = 0;
895 ei->i_dtime = 0; 997 ei->i_dtime = 0;
896 ei->i_block_group = group; 998 ei->i_block_group = group;
999 ei->i_last_alloc_group = ~0;
897 1000
898 ext4_set_inode_flags(inode); 1001 ext4_set_inode_flags(inode);
899 if (IS_DIRSYNC(inode)) 1002 if (IS_DIRSYNC(inode))