author	Curt Wohlgemuth <curtw@google.com>	2010-05-16 15:00:00 -0400
committer	Theodore Ts'o <tytso@mit.edu>	2010-05-16 15:00:00 -0400
commit	8a57d9d61a6e361c7bb159dda797672c1df1a691 (patch)
tree	39a01022ed2294f0acc94b45554c9a292db671dc /fs
parent	6d19c42b7cf81c39632b6d4dbc514e8449bcd346 (diff)
ext4: check for a good block group before loading buddy pages
This adds a new field in ext4_group_info to cache the order of the largest free extent available in a block group, and defers loading the buddy pages until *after* we've done a sanity check on the block group.

With large allocation requests (e.g., an 8MiB fallocate()) and relatively full partitions, it's easy to have no block groups with a block extent large enough to satisfy the input request length.  This currently causes the loop during cr == 0 in ext4_mb_regular_allocator() to load the buddy bitmap pages for EVERY block group.  That can be a lot of pages.  The patch below allows us to call ext4_mb_good_group() BEFORE we load the buddy pages (although we still have to check again after we lock the block group).

Addresses-Google-Bug: #2578108
Addresses-Google-Bug: #2704453

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
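To make the idea concrete, here is a minimal userspace sketch of the same technique (the names group_info, largest_free_order, MAX_ORDER, set_largest_free_order() and good_group() below are illustrative stand-ins, not the kernel symbols). Each block group keeps per-order counters of its free extents; caching the highest non-empty order once lets a power-of-two request of a given order be rejected with a single comparison, instead of loading the buddy bitmap pages and rescanning the counters for every group.

#include <stdio.h>

#define MAX_ORDER 13	/* e.g. s_blocksize_bits + 1 for 4KiB blocks (assumption) */

struct group_info {
	int counters[MAX_ORDER + 1];	/* free extents of each power-of-two order */
	int largest_free_order;		/* cached; -1 means no free extents */
};

/* Walk the per-order counters once, remember the highest non-empty order. */
static void set_largest_free_order(struct group_info *grp)
{
	int i;

	grp->largest_free_order = -1;
	for (i = MAX_ORDER; i >= 0; i--) {
		if (grp->counters[i] > 0) {
			grp->largest_free_order = i;
			break;
		}
	}
}

/* The cheap cr == 0 style check: one comparison, no buddy bitmap needed. */
static int good_group(const struct group_info *grp, int request_order)
{
	return grp->largest_free_order >= request_order;
}

int main(void)
{
	struct group_info grp = { .counters = { [3] = 2, [7] = 1 } };

	set_largest_free_order(&grp);
	printf("largest order: %d\n", grp.largest_free_order);		/* 7 */
	printf("order-11 request ok? %d\n", good_group(&grp, 11));	/* 0: skip this group */
	printf("order-6 request ok? %d\n", good_group(&grp, 6));	/* 1: worth loading buddy */
	return 0;
}

In the patch itself, mb_set_largest_free_order() maintains the cached order and the comparison sits in the cr == 0 arm of ext4_mb_good_group(), which is now called before ext4_mb_load_buddy().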
Diffstat (limited to 'fs')
-rw-r--r--	fs/ext4/ext4.h    |  1
-rw-r--r--	fs/ext4/mballoc.c | 70
2 files changed, 58 insertions(+), 13 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf7c5f0..d266003cac3e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1678,6 +1678,7 @@ struct ext4_group_info {
 	ext4_grpblk_t	bb_first_free;	/* first free block */
 	ext4_grpblk_t	bb_free;	/* total free blocks */
 	ext4_grpblk_t	bb_fragments;	/* nr of freespace fragments */
+	ext4_grpblk_t	bb_largest_free_order;/* order of largest frag in BG */
 	struct list_head bb_prealloc_list;
 #ifdef DOUBLE_CHECK
 	void		*bb_bitmap;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4f2d3a9d4e21..aa499fe11687 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
 	}
 }
 
+/*
+ * Cache the order of the largest free extent we have available in this block
+ * group.
+ */
+static void
+mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+{
+	int i;
+	int bits;
+
+	grp->bb_largest_free_order = -1; /* uninit */
+
+	bits = sb->s_blocksize_bits + 1;
+	for (i = bits; i >= 0; i--) {
+		if (grp->bb_counters[i] > 0) {
+			grp->bb_largest_free_order = i;
+			break;
+		}
+	}
+}
+
 static noinline_for_stack
 void ext4_mb_generate_buddy(struct super_block *sb,
 				void *buddy, void *bitmap, ext4_group_t group)
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 		 */
 		grp->bb_free = free;
 	}
+	mb_set_largest_free_order(sb, grp);
 
 	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
 
@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
  * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
  * So it can have information regarding groups_per_page which
  * is blocks_per_page/2
+ *
+ * Locking note:  This routine takes the block group lock of all groups
+ * for this page; do not hold this lock when calling this routine!
  */
 
 static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -910,6 +935,11 @@ out:
 	return err;
 }
 
+/*
+ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack
 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
@@ -1004,6 +1034,11 @@ err:
 	return ret;
 }
 
+/*
+ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack int
 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 					struct ext4_buddy *e4b)
@@ -1299,6 +1334,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			buddy = buddy2;
 		} while (1);
 	}
+	mb_set_largest_free_order(sb, e4b->bd_info);
 	mb_check_buddy(e4b);
 }
 
@@ -1427,6 +1463,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 		e4b->bd_info->bb_counters[ord]++;
 		e4b->bd_info->bb_counters[ord]++;
 	}
+	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
 
 	mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
 	mb_check_buddy(e4b);
@@ -1821,16 +1858,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
 	}
 }
 
+/* This is now called BEFORE we load the buddy bitmap. */
 static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 				ext4_group_t group, int cr)
 {
 	unsigned free, fragments;
-	unsigned i, bits;
 	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
 	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
 
 	BUG_ON(cr < 0 || cr >= 4);
-	BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
+
+	/* We only do this if the grp has never been initialized */
+	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+		int ret = ext4_mb_init_group(ac->ac_sb, group);
+		if (ret)
+			return 0;
+	}
 
 	free = grp->bb_free;
 	fragments = grp->bb_fragments;
@@ -1843,17 +1886,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	case 0:
 		BUG_ON(ac->ac_2order == 0);
 
+		if (grp->bb_largest_free_order < ac->ac_2order)
+			return 0;
+
 		/* Avoid using the first bg of a flexgroup for data files */
 		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
 		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
 		    ((group % flex_size) == 0))
 			return 0;
 
-		bits = ac->ac_sb->s_blocksize_bits + 1;
-		for (i = ac->ac_2order; i <= bits; i++)
-			if (grp->bb_counters[i] > 0)
-				return 1;
-		break;
+		return 1;
 	case 1:
 		if ((free / fragments) >= ac->ac_g_ex.fe_len)
 			return 1;
@@ -2024,14 +2066,11 @@ repeat:
 		group = ac->ac_g_ex.fe_group;
 
 		for (i = 0; i < ngroups; group++, i++) {
-			struct ext4_group_info *grp;
-
 			if (group == ngroups)
 				group = 0;
 
-			/* quick check to skip empty groups */
-			grp = ext4_get_group_info(sb, group);
-			if (grp->bb_free == 0)
+			/* This now checks without needing the buddy page */
+			if (!ext4_mb_good_group(ac, group, cr))
 				continue;
 
 			err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2039,8 +2078,12 @@ repeat:
 				goto out;
 
 			ext4_lock_group(sb, group);
+
+			/*
+			 * We need to check again after locking the
+			 * block group
+			 */
 			if (!ext4_mb_good_group(ac, group, cr)) {
-				/* someone did allocation from this group */
 				ext4_unlock_group(sb, group);
 				ext4_mb_unload_buddy(&e4b);
 				continue;
@@ -2253,6 +2296,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 		INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 		init_rwsem(&meta_group_info[i]->alloc_sem);
 		meta_group_info[i]->bb_free_root = RB_ROOT;
+		meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
 
 #ifdef DOUBLE_CHECK
 		{