diff options
author | Theodore Ts'o <tytso@mit.edu> | 2009-03-12 12:18:34 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2009-03-12 12:18:34 -0400 |
commit | a4912123b688e057084e6557cef8924f7ae5bbde (patch) | |
tree | 34e88705d6617b52caa0f87692b480119a9c9e2e /fs/ext4/ialloc.c | |
parent | 2dc6b0d48ca0599837df21b14bb8393d0804af57 (diff) |
ext4: New inode/block allocation algorithms for flex_bg filesystems
The find_group_flex() inode allocator is now only used if the
filesystem is mounted using the "oldalloc" mount option. It is
replaced with the original Orlov allocator that has been updated for
flex_bg filesystems (it should behave the same way if flex_bg is
disabled). The inode allocator now functions by taking into account
each flex_bg group, instead of each block group, when deciding whether
or not it's time to allocate a new directory into a fresh flex_bg.
The block allocator has also been changed so that the first block
group in each flex_bg is preferred for use for storing directory
blocks. This keeps directory blocks close together, which is good for
speeding up e2fsck since large directories are more likely to look
like this:
debugfs: stat /home/tytso/Maildir/cur
Inode: 1844562 Type: directory Mode: 0700 Flags: 0x81000
Generation: 1132745781 Version: 0x00000000:0000ad71
User: 15806 Group: 15806 Size: 1060864
File ACL: 0 Directory ACL: 0
Links: 2 Blockcount: 2072
Fragment: Address: 0 Number: 0 Size: 0
ctime: 0x499c0ff4:164961f4 -- Wed Feb 18 08:41:08 2009
atime: 0x499c0ff4:00000000 -- Wed Feb 18 08:41:08 2009
mtime: 0x49957f51:00000000 -- Fri Feb 13 09:10:25 2009
crtime: 0x499c0f57:00d51440 -- Wed Feb 18 08:38:31 2009
Size of extra inode fields: 28
BLOCKS:
(0):7348651, (1-258):7348654-7348911
TOTAL: 259
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/ialloc.c')
-rw-r--r-- | fs/ext4/ialloc.c | 215 |
1 files changed, 159 insertions, 56 deletions
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index ae3eb57dccdd..617f5a2d800a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
@@ -410,6 +410,43 @@ out: | |||
410 | return 0; | 410 | return 0; |
411 | } | 411 | } |
412 | 412 | ||
413 | struct orlov_stats { | ||
414 | __u32 free_inodes; | ||
415 | __u32 free_blocks; | ||
416 | __u32 used_dirs; | ||
417 | }; | ||
418 | |||
419 | /* | ||
420 | * Helper function for Orlov's allocator; returns critical information | ||
421 | * for a particular block group or flex_bg. If flex_size is 1, then g | ||
422 | * is a block group number; otherwise it is flex_bg number. | ||
423 | */ | ||
424 | void get_orlov_stats(struct super_block *sb, ext4_group_t g, | ||
425 | int flex_size, struct orlov_stats *stats) | ||
426 | { | ||
427 | struct ext4_group_desc *desc; | ||
428 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; | ||
429 | int i; | ||
430 | |||
431 | stats->free_inodes = 0; | ||
432 | stats->free_blocks = 0; | ||
433 | stats->used_dirs = 0; | ||
434 | |||
435 | g *= flex_size; | ||
436 | |||
437 | for (i = 0; i < flex_size; i++) { | ||
438 | if (g >= ngroups) | ||
439 | break; | ||
440 | desc = ext4_get_group_desc(sb, g++, NULL); | ||
441 | if (!desc) | ||
442 | continue; | ||
443 | |||
444 | stats->free_inodes += ext4_free_inodes_count(sb, desc); | ||
445 | stats->free_blocks += ext4_free_blks_count(sb, desc); | ||
446 | stats->used_dirs += ext4_used_dirs_count(sb, desc); | ||
447 | } | ||
448 | } | ||
449 | |||
413 | /* | 450 | /* |
414 | * Orlov's allocator for directories. | 451 | * Orlov's allocator for directories. |
415 | * | 452 | * |
@@ -425,35 +462,34 @@ out: | |||
425 | * it has too many directories already (max_dirs) or | 462 | * it has too many directories already (max_dirs) or |
426 | * it has too few free inodes left (min_inodes) or | 463 | * it has too few free inodes left (min_inodes) or |
427 | * it has too few free blocks left (min_blocks) or | 464 | * it has too few free blocks left (min_blocks) or |
428 | * it's already running too large debt (max_debt). | ||
429 | * Parent's group is preferred, if it doesn't satisfy these | 465 | * Parent's group is preferred, if it doesn't satisfy these |
430 | * conditions we search cyclically through the rest. If none | 466 | * conditions we search cyclically through the rest. If none |
431 | * of the groups look good we just look for a group with more | 467 | * of the groups look good we just look for a group with more |
432 | * free inodes than average (starting at parent's group). | 468 | * free inodes than average (starting at parent's group). |
433 | * | ||
434 | * Debt is incremented each time we allocate a directory and decremented | ||
435 | * when we allocate an inode, within 0--255. | ||
436 | */ | 469 | */ |
437 | 470 | ||
438 | #define INODE_COST 64 | ||
439 | #define BLOCK_COST 256 | ||
440 | |||
441 | static int find_group_orlov(struct super_block *sb, struct inode *parent, | 471 | static int find_group_orlov(struct super_block *sb, struct inode *parent, |
442 | ext4_group_t *group) | 472 | ext4_group_t *group, int mode) |
443 | { | 473 | { |
444 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; | 474 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; |
445 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 475 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
446 | struct ext4_super_block *es = sbi->s_es; | ||
447 | ext4_group_t ngroups = sbi->s_groups_count; | 476 | ext4_group_t ngroups = sbi->s_groups_count; |
448 | int inodes_per_group = EXT4_INODES_PER_GROUP(sb); | 477 | int inodes_per_group = EXT4_INODES_PER_GROUP(sb); |
449 | unsigned int freei, avefreei; | 478 | unsigned int freei, avefreei; |
450 | ext4_fsblk_t freeb, avefreeb; | 479 | ext4_fsblk_t freeb, avefreeb; |
451 | ext4_fsblk_t blocks_per_dir; | ||
452 | unsigned int ndirs; | 480 | unsigned int ndirs; |
453 | int max_debt, max_dirs, min_inodes; | 481 | int max_dirs, min_inodes; |
454 | ext4_grpblk_t min_blocks; | 482 | ext4_grpblk_t min_blocks; |
455 | ext4_group_t i; | 483 | ext4_group_t i, grp, g; |
456 | struct ext4_group_desc *desc; | 484 | struct ext4_group_desc *desc; |
485 | struct orlov_stats stats; | ||
486 | int flex_size = ext4_flex_bg_size(sbi); | ||
487 | |||
488 | if (flex_size > 1) { | ||
489 | ngroups = (ngroups + flex_size - 1) >> | ||
490 | sbi->s_log_groups_per_flex; | ||
491 | parent_group >>= sbi->s_log_groups_per_flex; | ||
492 | } | ||
457 | 493 | ||
458 | freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); | 494 | freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); |
459 | avefreei = freei / ngroups; | 495 | avefreei = freei / ngroups; |
@@ -462,71 +498,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, | |||
462 | do_div(avefreeb, ngroups); | 498 | do_div(avefreeb, ngroups); |
463 | ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); | 499 | ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); |
464 | 500 | ||
465 | if ((parent == sb->s_root->d_inode) || | 501 | if (S_ISDIR(mode) && |
466 | (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { | 502 | ((parent == sb->s_root->d_inode) || |
503 | (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { | ||
467 | int best_ndir = inodes_per_group; | 504 | int best_ndir = inodes_per_group; |
468 | ext4_group_t grp; | ||
469 | int ret = -1; | 505 | int ret = -1; |
470 | 506 | ||
471 | get_random_bytes(&grp, sizeof(grp)); | 507 | get_random_bytes(&grp, sizeof(grp)); |
472 | parent_group = (unsigned)grp % ngroups; | 508 | parent_group = (unsigned)grp % ngroups; |
473 | for (i = 0; i < ngroups; i++) { | 509 | for (i = 0; i < ngroups; i++) { |
474 | grp = (parent_group + i) % ngroups; | 510 | g = (parent_group + i) % ngroups; |
475 | desc = ext4_get_group_desc(sb, grp, NULL); | 511 | get_orlov_stats(sb, g, flex_size, &stats); |
476 | if (!desc || !ext4_free_inodes_count(sb, desc)) | 512 | if (!stats.free_inodes) |
477 | continue; | 513 | continue; |
478 | if (ext4_used_dirs_count(sb, desc) >= best_ndir) | 514 | if (stats.used_dirs >= best_ndir) |
479 | continue; | 515 | continue; |
480 | if (ext4_free_inodes_count(sb, desc) < avefreei) | 516 | if (stats.free_inodes < avefreei) |
481 | continue; | 517 | continue; |
482 | if (ext4_free_blks_count(sb, desc) < avefreeb) | 518 | if (stats.free_blocks < avefreeb) |
483 | continue; | 519 | continue; |
484 | *group = grp; | 520 | grp = g; |
485 | ret = 0; | 521 | ret = 0; |
486 | best_ndir = ext4_used_dirs_count(sb, desc); | 522 | best_ndir = stats.used_dirs; |
523 | } | ||
524 | if (ret) | ||
525 | goto fallback; | ||
526 | found_flex_bg: | ||
527 | if (flex_size == 1) { | ||
528 | *group = grp; | ||
529 | return 0; | ||
530 | } | ||
531 | |||
532 | /* | ||
533 | * We pack inodes at the beginning of the flexgroup's | ||
534 | * inode tables. Block allocation decisions will do | ||
535 | * something similar, although regular files will | ||
536 | * start at 2nd block group of the flexgroup. See | ||
537 | * ext4_ext_find_goal() and ext4_find_near(). | ||
538 | */ | ||
539 | grp *= flex_size; | ||
540 | for (i = 0; i < flex_size; i++) { | ||
541 | if (grp+i >= sbi->s_groups_count) | ||
542 | break; | ||
543 | desc = ext4_get_group_desc(sb, grp+i, NULL); | ||
544 | if (desc && ext4_free_inodes_count(sb, desc)) { | ||
545 | *group = grp+i; | ||
546 | return 0; | ||
547 | } | ||
487 | } | 548 | } |
488 | if (ret == 0) | ||
489 | return ret; | ||
490 | goto fallback; | 549 | goto fallback; |
491 | } | 550 | } |
492 | 551 | ||
493 | blocks_per_dir = ext4_blocks_count(es) - freeb; | ||
494 | do_div(blocks_per_dir, ndirs); | ||
495 | |||
496 | max_dirs = ndirs / ngroups + inodes_per_group / 16; | 552 | max_dirs = ndirs / ngroups + inodes_per_group / 16; |
497 | min_inodes = avefreei - inodes_per_group / 4; | 553 | min_inodes = avefreei - inodes_per_group*flex_size / 4; |
498 | min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; | 554 | if (min_inodes < 1) |
499 | 555 | min_inodes = 1; | |
500 | max_debt = EXT4_BLOCKS_PER_GROUP(sb); | 556 | min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; |
501 | max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); | 557 | |
502 | if (max_debt * INODE_COST > inodes_per_group) | 558 | /* |
503 | max_debt = inodes_per_group / INODE_COST; | 559 | * Start looking in the flex group where we last allocated an |
504 | if (max_debt > 255) | 560 | * inode for this parent directory |
505 | max_debt = 255; | 561 | */ |
506 | if (max_debt == 0) | 562 | if (EXT4_I(parent)->i_last_alloc_group != ~0) { |
507 | max_debt = 1; | 563 | parent_group = EXT4_I(parent)->i_last_alloc_group; |
564 | if (flex_size > 1) | ||
565 | parent_group >>= sbi->s_log_groups_per_flex; | ||
566 | } | ||
508 | 567 | ||
509 | for (i = 0; i < ngroups; i++) { | 568 | for (i = 0; i < ngroups; i++) { |
510 | *group = (parent_group + i) % ngroups; | 569 | grp = (parent_group + i) % ngroups; |
511 | desc = ext4_get_group_desc(sb, *group, NULL); | 570 | get_orlov_stats(sb, grp, flex_size, &stats); |
512 | if (!desc || !ext4_free_inodes_count(sb, desc)) | 571 | if (stats.used_dirs >= max_dirs) |
513 | continue; | ||
514 | if (ext4_used_dirs_count(sb, desc) >= max_dirs) | ||
515 | continue; | 572 | continue; |
516 | if (ext4_free_inodes_count(sb, desc) < min_inodes) | 573 | if (stats.free_inodes < min_inodes) |
517 | continue; | 574 | continue; |
518 | if (ext4_free_blks_count(sb, desc) < min_blocks) | 575 | if (stats.free_blocks < min_blocks) |
519 | continue; | 576 | continue; |
520 | return 0; | 577 | goto found_flex_bg; |
521 | } | 578 | } |
522 | 579 | ||
523 | fallback: | 580 | fallback: |
581 | ngroups = sbi->s_groups_count; | ||
582 | avefreei = freei / ngroups; | ||
583 | parent_group = EXT4_I(parent)->i_block_group; | ||
524 | for (i = 0; i < ngroups; i++) { | 584 | for (i = 0; i < ngroups; i++) { |
525 | *group = (parent_group + i) % ngroups; | 585 | grp = (parent_group + i) % ngroups; |
526 | desc = ext4_get_group_desc(sb, *group, NULL); | 586 | desc = ext4_get_group_desc(sb, grp, NULL); |
527 | if (desc && ext4_free_inodes_count(sb, desc) && | 587 | if (desc && ext4_free_inodes_count(sb, desc) && |
528 | ext4_free_inodes_count(sb, desc) >= avefreei) | 588 | ext4_free_inodes_count(sb, desc) >= avefreei) { |
589 | *group = grp; | ||
529 | return 0; | 590 | return 0; |
591 | } | ||
530 | } | 592 | } |
531 | 593 | ||
532 | if (avefreei) { | 594 | if (avefreei) { |
@@ -542,12 +604,51 @@ fallback: | |||
542 | } | 604 | } |
543 | 605 | ||
544 | static int find_group_other(struct super_block *sb, struct inode *parent, | 606 | static int find_group_other(struct super_block *sb, struct inode *parent, |
545 | ext4_group_t *group) | 607 | ext4_group_t *group, int mode) |
546 | { | 608 | { |
547 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; | 609 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; |
548 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; | 610 | ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; |
549 | struct ext4_group_desc *desc; | 611 | struct ext4_group_desc *desc; |
550 | ext4_group_t i; | 612 | ext4_group_t i, last; |
613 | int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); | ||
614 | |||
615 | /* | ||
616 | * Try to place the inode is the same flex group as its | ||
617 | * parent. If we can't find space, use the Orlov algorithm to | ||
618 | * find another flex group, and store that information in the | ||
619 | * parent directory's inode information so that use that flex | ||
620 | * group for future allocations. | ||
621 | */ | ||
622 | if (flex_size > 1) { | ||
623 | int retry = 0; | ||
624 | |||
625 | try_again: | ||
626 | parent_group &= ~(flex_size-1); | ||
627 | last = parent_group + flex_size; | ||
628 | if (last > ngroups) | ||
629 | last = ngroups; | ||
630 | for (i = parent_group; i < last; i++) { | ||
631 | desc = ext4_get_group_desc(sb, i, NULL); | ||
632 | if (desc && ext4_free_inodes_count(sb, desc)) { | ||
633 | *group = i; | ||
634 | return 0; | ||
635 | } | ||
636 | } | ||
637 | if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { | ||
638 | retry = 1; | ||
639 | parent_group = EXT4_I(parent)->i_last_alloc_group; | ||
640 | goto try_again; | ||
641 | } | ||
642 | /* | ||
643 | * If this didn't work, use the Orlov search algorithm | ||
644 | * to find a new flex group; we pass in the mode to | ||
645 | * avoid the topdir algorithms. | ||
646 | */ | ||
647 | *group = parent_group + flex_size; | ||
648 | if (*group > ngroups) | ||
649 | *group = 0; | ||
650 | return find_group_orlov(sb, parent, group, mode); | ||
651 | } | ||
551 | 652 | ||
552 | /* | 653 | /* |
553 | * Try to place the inode in its parent directory | 654 | * Try to place the inode in its parent directory |
@@ -716,10 +817,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) | |||
716 | sbi = EXT4_SB(sb); | 817 | sbi = EXT4_SB(sb); |
717 | es = sbi->s_es; | 818 | es = sbi->s_es; |
718 | 819 | ||
719 | if (sbi->s_log_groups_per_flex) { | 820 | if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { |
720 | ret2 = find_group_flex(sb, dir, &group); | 821 | ret2 = find_group_flex(sb, dir, &group); |
721 | if (ret2 == -1) { | 822 | if (ret2 == -1) { |
722 | ret2 = find_group_other(sb, dir, &group); | 823 | ret2 = find_group_other(sb, dir, &group, mode); |
723 | if (ret2 == 0 && once) | 824 | if (ret2 == 0 && once) |
724 | once = 0; | 825 | once = 0; |
725 | printk(KERN_NOTICE "ext4: find_group_flex " | 826 | printk(KERN_NOTICE "ext4: find_group_flex " |
@@ -733,11 +834,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) | |||
733 | if (test_opt(sb, OLDALLOC)) | 834 | if (test_opt(sb, OLDALLOC)) |
734 | ret2 = find_group_dir(sb, dir, &group); | 835 | ret2 = find_group_dir(sb, dir, &group); |
735 | else | 836 | else |
736 | ret2 = find_group_orlov(sb, dir, &group); | 837 | ret2 = find_group_orlov(sb, dir, &group, mode); |
737 | } else | 838 | } else |
738 | ret2 = find_group_other(sb, dir, &group); | 839 | ret2 = find_group_other(sb, dir, &group, mode); |
739 | 840 | ||
740 | got_group: | 841 | got_group: |
842 | EXT4_I(dir)->i_last_alloc_group = group; | ||
741 | err = -ENOSPC; | 843 | err = -ENOSPC; |
742 | if (ret2 == -1) | 844 | if (ret2 == -1) |
743 | goto out; | 845 | goto out; |
@@ -894,6 +996,7 @@ got: | |||
894 | ei->i_file_acl = 0; | 996 | ei->i_file_acl = 0; |
895 | ei->i_dtime = 0; | 997 | ei->i_dtime = 0; |
896 | ei->i_block_group = group; | 998 | ei->i_block_group = group; |
999 | ei->i_last_alloc_group = ~0; | ||
897 | 1000 | ||
898 | ext4_set_inode_flags(inode); | 1001 | ext4_set_inode_flags(inode); |
899 | if (IS_DIRSYNC(inode)) | 1002 | if (IS_DIRSYNC(inode)) |