aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/ialloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/ialloc.c')
-rw-r--r--fs/ext4/ialloc.c273
1 files changed, 190 insertions, 83 deletions
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fb51b40e3e8f..47b84e8df568 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -189,7 +189,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
189 struct ext4_super_block *es; 189 struct ext4_super_block *es;
190 struct ext4_sb_info *sbi; 190 struct ext4_sb_info *sbi;
191 int fatal = 0, err, count, cleared; 191 int fatal = 0, err, count, cleared;
192 ext4_group_t flex_group;
193 192
194 if (atomic_read(&inode->i_count) > 1) { 193 if (atomic_read(&inode->i_count) > 1) {
195 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", 194 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
@@ -268,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
268 if (is_directory) { 267 if (is_directory) {
269 count = ext4_used_dirs_count(sb, gdp) - 1; 268 count = ext4_used_dirs_count(sb, gdp) - 1;
270 ext4_used_dirs_set(sb, gdp, count); 269 ext4_used_dirs_set(sb, gdp, count);
270 if (sbi->s_log_groups_per_flex) {
271 ext4_group_t f;
272
273 f = ext4_flex_group(sbi, block_group);
274 atomic_dec(&sbi->s_flex_groups[f].free_inodes);
275 }
276
271 } 277 }
272 gdp->bg_checksum = ext4_group_desc_csum(sbi, 278 gdp->bg_checksum = ext4_group_desc_csum(sbi,
273 block_group, gdp); 279 block_group, gdp);
@@ -277,10 +283,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
277 percpu_counter_dec(&sbi->s_dirs_counter); 283 percpu_counter_dec(&sbi->s_dirs_counter);
278 284
279 if (sbi->s_log_groups_per_flex) { 285 if (sbi->s_log_groups_per_flex) {
280 flex_group = ext4_flex_group(sbi, block_group); 286 ext4_group_t f;
281 spin_lock(sb_bgl_lock(sbi, flex_group)); 287
282 sbi->s_flex_groups[flex_group].free_inodes++; 288 f = ext4_flex_group(sbi, block_group);
283 spin_unlock(sb_bgl_lock(sbi, flex_group)); 289 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
284 } 290 }
285 } 291 }
286 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); 292 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
@@ -360,9 +366,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
360 sbi->s_log_groups_per_flex; 366 sbi->s_log_groups_per_flex;
361 367
362find_close_to_parent: 368find_close_to_parent:
363 flexbg_free_blocks = flex_group[best_flex].free_blocks; 369 flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
364 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; 370 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
365 if (flex_group[best_flex].free_inodes && 371 if (atomic_read(&flex_group[best_flex].free_inodes) &&
366 flex_freeb_ratio > free_block_ratio) 372 flex_freeb_ratio > free_block_ratio)
367 goto found_flexbg; 373 goto found_flexbg;
368 374
@@ -375,24 +381,24 @@ find_close_to_parent:
375 if (i == parent_fbg_group || i == parent_fbg_group - 1) 381 if (i == parent_fbg_group || i == parent_fbg_group - 1)
376 continue; 382 continue;
377 383
378 flexbg_free_blocks = flex_group[i].free_blocks; 384 flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
379 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; 385 flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
380 386
381 if (flex_freeb_ratio > free_block_ratio && 387 if (flex_freeb_ratio > free_block_ratio &&
382 flex_group[i].free_inodes) { 388 (atomic_read(&flex_group[i].free_inodes))) {
383 best_flex = i; 389 best_flex = i;
384 goto found_flexbg; 390 goto found_flexbg;
385 } 391 }
386 392
387 if (flex_group[best_flex].free_inodes == 0 || 393 if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
388 (flex_group[i].free_blocks > 394 ((atomic_read(&flex_group[i].free_blocks) >
389 flex_group[best_flex].free_blocks && 395 atomic_read(&flex_group[best_flex].free_blocks)) &&
390 flex_group[i].free_inodes)) 396 atomic_read(&flex_group[i].free_inodes)))
391 best_flex = i; 397 best_flex = i;
392 } 398 }
393 399
394 if (!flex_group[best_flex].free_inodes || 400 if (!atomic_read(&flex_group[best_flex].free_inodes) ||
395 !flex_group[best_flex].free_blocks) 401 !atomic_read(&flex_group[best_flex].free_blocks))
396 return -1; 402 return -1;
397 403
398found_flexbg: 404found_flexbg:
@@ -410,6 +416,42 @@ out:
410 return 0; 416 return 0;
411} 417}
412 418
419struct orlov_stats {
420 __u32 free_inodes;
421 __u32 free_blocks;
422 __u32 used_dirs;
423};
424
425/*
426 * Helper function for Orlov's allocator; returns critical information
427 * for a particular block group or flex_bg. If flex_size is 1, then g
428 * is a block group number; otherwise it is flex_bg number.
429 */
430void get_orlov_stats(struct super_block *sb, ext4_group_t g,
431 int flex_size, struct orlov_stats *stats)
432{
433 struct ext4_group_desc *desc;
434 struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
435
436 if (flex_size > 1) {
437 stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
438 stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
439 stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
440 return;
441 }
442
443 desc = ext4_get_group_desc(sb, g, NULL);
444 if (desc) {
445 stats->free_inodes = ext4_free_inodes_count(sb, desc);
446 stats->free_blocks = ext4_free_blks_count(sb, desc);
447 stats->used_dirs = ext4_used_dirs_count(sb, desc);
448 } else {
449 stats->free_inodes = 0;
450 stats->free_blocks = 0;
451 stats->used_dirs = 0;
452 }
453}
454
413/* 455/*
414 * Orlov's allocator for directories. 456 * Orlov's allocator for directories.
415 * 457 *
@@ -425,35 +467,34 @@ out:
425 * it has too many directories already (max_dirs) or 467 * it has too many directories already (max_dirs) or
426 * it has too few free inodes left (min_inodes) or 468 * it has too few free inodes left (min_inodes) or
427 * it has too few free blocks left (min_blocks) or 469 * it has too few free blocks left (min_blocks) or
428 * it's already running too large debt (max_debt).
429 * Parent's group is preferred, if it doesn't satisfy these 470 * Parent's group is preferred, if it doesn't satisfy these
430 * conditions we search cyclically through the rest. If none 471 * conditions we search cyclically through the rest. If none
431 * of the groups look good we just look for a group with more 472 * of the groups look good we just look for a group with more
432 * free inodes than average (starting at parent's group). 473 * free inodes than average (starting at parent's group).
433 *
434 * Debt is incremented each time we allocate a directory and decremented
435 * when we allocate an inode, within 0--255.
436 */ 474 */
437 475
438#define INODE_COST 64
439#define BLOCK_COST 256
440
441static int find_group_orlov(struct super_block *sb, struct inode *parent, 476static int find_group_orlov(struct super_block *sb, struct inode *parent,
442 ext4_group_t *group) 477 ext4_group_t *group, int mode)
443{ 478{
444 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 479 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
445 struct ext4_sb_info *sbi = EXT4_SB(sb); 480 struct ext4_sb_info *sbi = EXT4_SB(sb);
446 struct ext4_super_block *es = sbi->s_es;
447 ext4_group_t ngroups = sbi->s_groups_count; 481 ext4_group_t ngroups = sbi->s_groups_count;
448 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 482 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
449 unsigned int freei, avefreei; 483 unsigned int freei, avefreei;
450 ext4_fsblk_t freeb, avefreeb; 484 ext4_fsblk_t freeb, avefreeb;
451 ext4_fsblk_t blocks_per_dir;
452 unsigned int ndirs; 485 unsigned int ndirs;
453 int max_debt, max_dirs, min_inodes; 486 int max_dirs, min_inodes;
454 ext4_grpblk_t min_blocks; 487 ext4_grpblk_t min_blocks;
455 ext4_group_t i; 488 ext4_group_t i, grp, g;
456 struct ext4_group_desc *desc; 489 struct ext4_group_desc *desc;
490 struct orlov_stats stats;
491 int flex_size = ext4_flex_bg_size(sbi);
492
493 if (flex_size > 1) {
494 ngroups = (ngroups + flex_size - 1) >>
495 sbi->s_log_groups_per_flex;
496 parent_group >>= sbi->s_log_groups_per_flex;
497 }
457 498
458 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); 499 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
459 avefreei = freei / ngroups; 500 avefreei = freei / ngroups;
@@ -462,71 +503,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
462 do_div(avefreeb, ngroups); 503 do_div(avefreeb, ngroups);
463 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); 504 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
464 505
465 if ((parent == sb->s_root->d_inode) || 506 if (S_ISDIR(mode) &&
466 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { 507 ((parent == sb->s_root->d_inode) ||
508 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
467 int best_ndir = inodes_per_group; 509 int best_ndir = inodes_per_group;
468 ext4_group_t grp;
469 int ret = -1; 510 int ret = -1;
470 511
471 get_random_bytes(&grp, sizeof(grp)); 512 get_random_bytes(&grp, sizeof(grp));
472 parent_group = (unsigned)grp % ngroups; 513 parent_group = (unsigned)grp % ngroups;
473 for (i = 0; i < ngroups; i++) { 514 for (i = 0; i < ngroups; i++) {
474 grp = (parent_group + i) % ngroups; 515 g = (parent_group + i) % ngroups;
475 desc = ext4_get_group_desc(sb, grp, NULL); 516 get_orlov_stats(sb, g, flex_size, &stats);
476 if (!desc || !ext4_free_inodes_count(sb, desc)) 517 if (!stats.free_inodes)
477 continue; 518 continue;
478 if (ext4_used_dirs_count(sb, desc) >= best_ndir) 519 if (stats.used_dirs >= best_ndir)
479 continue; 520 continue;
480 if (ext4_free_inodes_count(sb, desc) < avefreei) 521 if (stats.free_inodes < avefreei)
481 continue; 522 continue;
482 if (ext4_free_blks_count(sb, desc) < avefreeb) 523 if (stats.free_blocks < avefreeb)
483 continue; 524 continue;
484 *group = grp; 525 grp = g;
485 ret = 0; 526 ret = 0;
486 best_ndir = ext4_used_dirs_count(sb, desc); 527 best_ndir = stats.used_dirs;
528 }
529 if (ret)
530 goto fallback;
531 found_flex_bg:
532 if (flex_size == 1) {
533 *group = grp;
534 return 0;
535 }
536
537 /*
538 * We pack inodes at the beginning of the flexgroup's
539 * inode tables. Block allocation decisions will do
540 * something similar, although regular files will
541 * start at 2nd block group of the flexgroup. See
542 * ext4_ext_find_goal() and ext4_find_near().
543 */
544 grp *= flex_size;
545 for (i = 0; i < flex_size; i++) {
546 if (grp+i >= sbi->s_groups_count)
547 break;
548 desc = ext4_get_group_desc(sb, grp+i, NULL);
549 if (desc && ext4_free_inodes_count(sb, desc)) {
550 *group = grp+i;
551 return 0;
552 }
487 } 553 }
488 if (ret == 0)
489 return ret;
490 goto fallback; 554 goto fallback;
491 } 555 }
492 556
493 blocks_per_dir = ext4_blocks_count(es) - freeb;
494 do_div(blocks_per_dir, ndirs);
495
496 max_dirs = ndirs / ngroups + inodes_per_group / 16; 557 max_dirs = ndirs / ngroups + inodes_per_group / 16;
497 min_inodes = avefreei - inodes_per_group / 4; 558 min_inodes = avefreei - inodes_per_group*flex_size / 4;
498 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; 559 if (min_inodes < 1)
499 560 min_inodes = 1;
500 max_debt = EXT4_BLOCKS_PER_GROUP(sb); 561 min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
501 max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); 562
502 if (max_debt * INODE_COST > inodes_per_group) 563 /*
503 max_debt = inodes_per_group / INODE_COST; 564 * Start looking in the flex group where we last allocated an
504 if (max_debt > 255) 565 * inode for this parent directory
505 max_debt = 255; 566 */
506 if (max_debt == 0) 567 if (EXT4_I(parent)->i_last_alloc_group != ~0) {
507 max_debt = 1; 568 parent_group = EXT4_I(parent)->i_last_alloc_group;
569 if (flex_size > 1)
570 parent_group >>= sbi->s_log_groups_per_flex;
571 }
508 572
509 for (i = 0; i < ngroups; i++) { 573 for (i = 0; i < ngroups; i++) {
510 *group = (parent_group + i) % ngroups; 574 grp = (parent_group + i) % ngroups;
511 desc = ext4_get_group_desc(sb, *group, NULL); 575 get_orlov_stats(sb, grp, flex_size, &stats);
512 if (!desc || !ext4_free_inodes_count(sb, desc)) 576 if (stats.used_dirs >= max_dirs)
513 continue;
514 if (ext4_used_dirs_count(sb, desc) >= max_dirs)
515 continue; 577 continue;
516 if (ext4_free_inodes_count(sb, desc) < min_inodes) 578 if (stats.free_inodes < min_inodes)
517 continue; 579 continue;
518 if (ext4_free_blks_count(sb, desc) < min_blocks) 580 if (stats.free_blocks < min_blocks)
519 continue; 581 continue;
520 return 0; 582 goto found_flex_bg;
521 } 583 }
522 584
523fallback: 585fallback:
586 ngroups = sbi->s_groups_count;
587 avefreei = freei / ngroups;
588 parent_group = EXT4_I(parent)->i_block_group;
524 for (i = 0; i < ngroups; i++) { 589 for (i = 0; i < ngroups; i++) {
525 *group = (parent_group + i) % ngroups; 590 grp = (parent_group + i) % ngroups;
526 desc = ext4_get_group_desc(sb, *group, NULL); 591 desc = ext4_get_group_desc(sb, grp, NULL);
527 if (desc && ext4_free_inodes_count(sb, desc) && 592 if (desc && ext4_free_inodes_count(sb, desc) &&
528 ext4_free_inodes_count(sb, desc) >= avefreei) 593 ext4_free_inodes_count(sb, desc) >= avefreei) {
594 *group = grp;
529 return 0; 595 return 0;
596 }
530 } 597 }
531 598
532 if (avefreei) { 599 if (avefreei) {
@@ -542,12 +609,51 @@ fallback:
542} 609}
543 610
544static int find_group_other(struct super_block *sb, struct inode *parent, 611static int find_group_other(struct super_block *sb, struct inode *parent,
545 ext4_group_t *group) 612 ext4_group_t *group, int mode)
546{ 613{
547 ext4_group_t parent_group = EXT4_I(parent)->i_block_group; 614 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
548 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 615 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
549 struct ext4_group_desc *desc; 616 struct ext4_group_desc *desc;
550 ext4_group_t i; 617 ext4_group_t i, last;
618 int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
619
620 /*
621 * Try to place the inode is the same flex group as its
622 * parent. If we can't find space, use the Orlov algorithm to
623 * find another flex group, and store that information in the
624 * parent directory's inode information so that use that flex
625 * group for future allocations.
626 */
627 if (flex_size > 1) {
628 int retry = 0;
629
630 try_again:
631 parent_group &= ~(flex_size-1);
632 last = parent_group + flex_size;
633 if (last > ngroups)
634 last = ngroups;
635 for (i = parent_group; i < last; i++) {
636 desc = ext4_get_group_desc(sb, i, NULL);
637 if (desc && ext4_free_inodes_count(sb, desc)) {
638 *group = i;
639 return 0;
640 }
641 }
642 if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
643 retry = 1;
644 parent_group = EXT4_I(parent)->i_last_alloc_group;
645 goto try_again;
646 }
647 /*
648 * If this didn't work, use the Orlov search algorithm
649 * to find a new flex group; we pass in the mode to
650 * avoid the topdir algorithms.
651 */
652 *group = parent_group + flex_size;
653 if (*group > ngroups)
654 *group = 0;
655 return find_group_orlov(sb, parent, group, mode);
656 }
551 657
552 /* 658 /*
553 * Try to place the inode in its parent directory 659 * Try to place the inode in its parent directory
@@ -665,6 +771,11 @@ static int ext4_claim_inode(struct super_block *sb,
665 if (S_ISDIR(mode)) { 771 if (S_ISDIR(mode)) {
666 count = ext4_used_dirs_count(sb, gdp) + 1; 772 count = ext4_used_dirs_count(sb, gdp) + 1;
667 ext4_used_dirs_set(sb, gdp, count); 773 ext4_used_dirs_set(sb, gdp, count);
774 if (sbi->s_log_groups_per_flex) {
775 ext4_group_t f = ext4_flex_group(sbi, group);
776
777 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
778 }
668 } 779 }
669 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); 780 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
670err_ret: 781err_ret:
@@ -716,10 +827,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
716 sbi = EXT4_SB(sb); 827 sbi = EXT4_SB(sb);
717 es = sbi->s_es; 828 es = sbi->s_es;
718 829
719 if (sbi->s_log_groups_per_flex) { 830 if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
720 ret2 = find_group_flex(sb, dir, &group); 831 ret2 = find_group_flex(sb, dir, &group);
721 if (ret2 == -1) { 832 if (ret2 == -1) {
722 ret2 = find_group_other(sb, dir, &group); 833 ret2 = find_group_other(sb, dir, &group, mode);
723 if (ret2 == 0 && once) 834 if (ret2 == 0 && once)
724 once = 0; 835 once = 0;
725 printk(KERN_NOTICE "ext4: find_group_flex " 836 printk(KERN_NOTICE "ext4: find_group_flex "
@@ -733,11 +844,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
733 if (test_opt(sb, OLDALLOC)) 844 if (test_opt(sb, OLDALLOC))
734 ret2 = find_group_dir(sb, dir, &group); 845 ret2 = find_group_dir(sb, dir, &group);
735 else 846 else
736 ret2 = find_group_orlov(sb, dir, &group); 847 ret2 = find_group_orlov(sb, dir, &group, mode);
737 } else 848 } else
738 ret2 = find_group_other(sb, dir, &group); 849 ret2 = find_group_other(sb, dir, &group, mode);
739 850
740got_group: 851got_group:
852 EXT4_I(dir)->i_last_alloc_group = group;
741 err = -ENOSPC; 853 err = -ENOSPC;
742 if (ret2 == -1) 854 if (ret2 == -1)
743 goto out; 855 goto out;
@@ -858,9 +970,7 @@ got:
858 970
859 if (sbi->s_log_groups_per_flex) { 971 if (sbi->s_log_groups_per_flex) {
860 flex_group = ext4_flex_group(sbi, group); 972 flex_group = ext4_flex_group(sbi, group);
861 spin_lock(sb_bgl_lock(sbi, flex_group)); 973 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
862 sbi->s_flex_groups[flex_group].free_inodes--;
863 spin_unlock(sb_bgl_lock(sbi, flex_group));
864 } 974 }
865 975
866 inode->i_uid = current_fsuid(); 976 inode->i_uid = current_fsuid();
@@ -885,19 +995,16 @@ got:
885 ei->i_disksize = 0; 995 ei->i_disksize = 0;
886 996
887 /* 997 /*
888 * Don't inherit extent flag from directory. We set extent flag on 998 * Don't inherit extent flag from directory, amongst others. We set
889 * newly created directory and file only if -o extent mount option is 999 * extent flag on newly created directory and file only if -o extent
890 * specified 1000 * mount option is specified
891 */ 1001 */
892 ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); 1002 ei->i_flags =
893 if (S_ISLNK(mode)) 1003 ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
894 ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
895 /* dirsync only applies to directories */
896 if (!S_ISDIR(mode))
897 ei->i_flags &= ~EXT4_DIRSYNC_FL;
898 ei->i_file_acl = 0; 1004 ei->i_file_acl = 0;
899 ei->i_dtime = 0; 1005 ei->i_dtime = 0;
900 ei->i_block_group = group; 1006 ei->i_block_group = group;
1007 ei->i_last_alloc_group = ~0;
901 1008
902 ext4_set_inode_flags(inode); 1009 ext4_set_inode_flags(inode);
903 if (IS_DIRSYNC(inode)) 1010 if (IS_DIRSYNC(inode))