aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/ext4.txt10
-rw-r--r--Documentation/filesystems/proc.txt39
-rw-r--r--fs/ext4/Makefile2
-rw-r--r--fs/ext4/balloc.c67
-rw-r--r--fs/ext4/extents.c45
-rw-r--r--fs/ext4/inode.c15
-rw-r--r--fs/ext4/mballoc.c4552
-rw-r--r--fs/ext4/migrate.c10
-rw-r--r--fs/ext4/super.c62
-rw-r--r--fs/ext4/xattr.c4
-rw-r--r--include/linux/ext4_fs.h76
-rw-r--r--include/linux/ext4_fs_i.h4
-rw-r--r--include/linux/ext4_fs_sb.h52
13 files changed, 4900 insertions, 38 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 4f329afe20ec..560f88dc7090 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -86,9 +86,11 @@ Alex is working on a new set of patches right now.
86When mounting an ext4 filesystem, the following option are accepted: 86When mounting an ext4 filesystem, the following option are accepted:
87(*) == default 87(*) == default
88 88
89extents ext4 will use extents to address file data. The 89extents (*) ext4 will use extents to address file data. The
90 file system will no longer be mountable by ext3. 90 file system will no longer be mountable by ext3.
91 91
92noextents ext4 will not use extents for newly created files
93
92journal_checksum Enable checksumming of the journal transactions. 94journal_checksum Enable checksumming of the journal transactions.
93 This will allow the recovery code in e2fsck and the 95 This will allow the recovery code in e2fsck and the
94 kernel to detect corruption in the kernel. It is a 96 kernel to detect corruption in the kernel. It is a
@@ -206,6 +208,12 @@ nobh (a) cache disk block mapping information
206 "nobh" option tries to avoid associating buffer 208 "nobh" option tries to avoid associating buffer
207 heads (supported only for "writeback" mode). 209 heads (supported only for "writeback" mode).
208 210
211mballoc (*) Use the multiple block allocator for block allocation
212nomballoc Disable the multiple block allocator for block allocation.
213stripe=n Number of filesystem blocks that mballoc will try
214 to use for allocation size and alignment. For RAID5/6
215 systems this should be the number of data
216 disks * RAID chunk size in file system blocks.
209 217
210Data Mode 218Data Mode
211--------- 219---------
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index dec99455321f..4413a2d4646f 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -857,6 +857,45 @@ CPUs.
857The "procs_blocked" line gives the number of processes currently blocked, 857The "procs_blocked" line gives the number of processes currently blocked,
858waiting for I/O to complete. 858waiting for I/O to complete.
859 859
8601.9 Ext4 file system parameters
861------------------------------
862Ext4 file systems have one directory per partition under /proc/fs/ext4/
863# ls /proc/fs/ext4/hdc/
864group_prealloc max_to_scan mb_groups mb_history min_to_scan order2_req
865stats stream_req
866
867mb_groups:
868This file gives the details of the multiblock allocator's buddy cache of free blocks
869
870mb_history:
871Multiblock allocation history.
872
873stats:
874This file indicates whether the multiblock allocator should start collecting
875statistics. The statistics are shown during unmount.
876
877group_prealloc:
878The multiblock allocator normalizes the block allocation request to
879group_prealloc filesystem blocks if we don't have a stripe value set.
880The stripe value can be specified at mount time or during mke2fs.
881
882max_to_scan:
883How long multiblock allocator can look for a best extent (in found extents)
884
885min_to_scan:
886How long multiblock allocator must look for a best extent
887
888order2_req:
889The multiblock allocator uses 2^N search using buddies only for requests greater
890than or equal to order2_req. The request size is specified in file system
891blocks. A value of 2 indicates that this applies only if the requests are
892greater than or equal to 4 blocks.
893
894stream_req:
895Files smaller than stream_req are served by the stream allocator, whose
896purpose is to pack requests as close to each other as possible to
897produce smooth I/O traffic. A value of 16 indicates that files smaller than 16
898filesystem blocks in size will use group based preallocation.
860 899
861------------------------------------------------------------------------------ 900------------------------------------------------------------------------------
862Summary 901Summary
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index d5fd80bc0d04..ac6fa8ca0a2f 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
6 6
7ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ 7ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o migrate.o 9 ext4_jbd2.o migrate.o mballoc.o
10 10
11ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 11ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
12ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o 12ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 80a4616c8244..ac75ea953d83 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -577,6 +577,8 @@ void ext4_discard_reservation(struct inode *inode)
577 struct ext4_reserve_window_node *rsv; 577 struct ext4_reserve_window_node *rsv;
578 spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock; 578 spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
579 579
580 ext4_mb_discard_inode_preallocations(inode);
581
580 if (!block_i) 582 if (!block_i)
581 return; 583 return;
582 584
@@ -785,19 +787,29 @@ error_return:
785 * @inode: inode 787 * @inode: inode
786 * @block: start physical block to free 788 * @block: start physical block to free
787 * @count: number of blocks to count 789 * @count: number of blocks to count
790 * @metadata: Are these metadata blocks
788 */ 791 */
789void ext4_free_blocks(handle_t *handle, struct inode *inode, 792void ext4_free_blocks(handle_t *handle, struct inode *inode,
790 ext4_fsblk_t block, unsigned long count) 793 ext4_fsblk_t block, unsigned long count,
794 int metadata)
791{ 795{
792 struct super_block * sb; 796 struct super_block * sb;
793 unsigned long dquot_freed_blocks; 797 unsigned long dquot_freed_blocks;
794 798
799 /* this isn't the right place to decide whether block is metadata
800 * inode.c/extents.c knows better, but for safety ... */
801 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
802 ext4_should_journal_data(inode))
803 metadata = 1;
804
795 sb = inode->i_sb; 805 sb = inode->i_sb;
796 if (!sb) { 806
797 printk ("ext4_free_blocks: nonexistent device"); 807 if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
798 return; 808 ext4_free_blocks_sb(handle, sb, block, count,
799 } 809 &dquot_freed_blocks);
800 ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); 810 else
811 ext4_mb_free_blocks(handle, inode, block, count,
812 metadata, &dquot_freed_blocks);
801 if (dquot_freed_blocks) 813 if (dquot_freed_blocks)
802 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); 814 DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
803 return; 815 return;
@@ -1576,7 +1588,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1576} 1588}
1577 1589
1578/** 1590/**
1579 * ext4_new_blocks() -- core block(s) allocation function 1591 * ext4_new_blocks_old() -- core block(s) allocation function
1580 * @handle: handle to this transaction 1592 * @handle: handle to this transaction
1581 * @inode: file inode 1593 * @inode: file inode
1582 * @goal: given target block(filesystem wide) 1594 * @goal: given target block(filesystem wide)
@@ -1589,7 +1601,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1589 * any specific goal block. 1601 * any specific goal block.
1590 * 1602 *
1591 */ 1603 */
1592ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, 1604ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
1593 ext4_fsblk_t goal, unsigned long *count, int *errp) 1605 ext4_fsblk_t goal, unsigned long *count, int *errp)
1594{ 1606{
1595 struct buffer_head *bitmap_bh = NULL; 1607 struct buffer_head *bitmap_bh = NULL;
@@ -1849,13 +1861,46 @@ out:
1849} 1861}
1850 1862
1851ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode, 1863ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
1852 ext4_fsblk_t goal, int *errp) 1864 ext4_fsblk_t goal, int *errp)
1865{
1866 struct ext4_allocation_request ar;
1867 ext4_fsblk_t ret;
1868
1869 if (!test_opt(inode->i_sb, MBALLOC)) {
1870 unsigned long count = 1;
1871 ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
1872 return ret;
1873 }
1874
1875 memset(&ar, 0, sizeof(ar));
1876 ar.inode = inode;
1877 ar.goal = goal;
1878 ar.len = 1;
1879 ret = ext4_mb_new_blocks(handle, &ar, errp);
1880 return ret;
1881}
1882
1883ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1884 ext4_fsblk_t goal, unsigned long *count, int *errp)
1853{ 1885{
1854 unsigned long count = 1; 1886 struct ext4_allocation_request ar;
1887 ext4_fsblk_t ret;
1855 1888
1856 return ext4_new_blocks(handle, inode, goal, &count, errp); 1889 if (!test_opt(inode->i_sb, MBALLOC)) {
1890 ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
1891 return ret;
1892 }
1893
1894 memset(&ar, 0, sizeof(ar));
1895 ar.inode = inode;
1896 ar.goal = goal;
1897 ar.len = *count;
1898 ret = ext4_mb_new_blocks(handle, &ar, errp);
1899 *count = ar.len;
1900 return ret;
1857} 1901}
1858 1902
1903
1859/** 1904/**
1860 * ext4_count_free_blocks() -- count filesystem free blocks 1905 * ext4_count_free_blocks() -- count filesystem free blocks
1861 * @sb: superblock 1906 * @sb: superblock
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index f5cf2a94b6fc..0cffb59fff46 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -853,7 +853,7 @@ cleanup:
853 for (i = 0; i < depth; i++) { 853 for (i = 0; i < depth; i++) {
854 if (!ablocks[i]) 854 if (!ablocks[i])
855 continue; 855 continue;
856 ext4_free_blocks(handle, inode, ablocks[i], 1); 856 ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
857 } 857 }
858 } 858 }
859 kfree(ablocks); 859 kfree(ablocks);
@@ -1698,7 +1698,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1698 ext_debug("index is empty, remove it, free block %llu\n", leaf); 1698 ext_debug("index is empty, remove it, free block %llu\n", leaf);
1699 bh = sb_find_get_block(inode->i_sb, leaf); 1699 bh = sb_find_get_block(inode->i_sb, leaf);
1700 ext4_forget(handle, 1, inode, bh, leaf); 1700 ext4_forget(handle, 1, inode, bh, leaf);
1701 ext4_free_blocks(handle, inode, leaf, 1); 1701 ext4_free_blocks(handle, inode, leaf, 1, 1);
1702 return err; 1702 return err;
1703} 1703}
1704 1704
@@ -1759,8 +1759,10 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1759{ 1759{
1760 struct buffer_head *bh; 1760 struct buffer_head *bh;
1761 unsigned short ee_len = ext4_ext_get_actual_len(ex); 1761 unsigned short ee_len = ext4_ext_get_actual_len(ex);
1762 int i; 1762 int i, metadata = 0;
1763 1763
1764 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1765 metadata = 1;
1764#ifdef EXTENTS_STATS 1766#ifdef EXTENTS_STATS
1765 { 1767 {
1766 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1768 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1789,7 +1791,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
1789 bh = sb_find_get_block(inode->i_sb, start + i); 1791 bh = sb_find_get_block(inode->i_sb, start + i);
1790 ext4_forget(handle, 0, inode, bh, start + i); 1792 ext4_forget(handle, 0, inode, bh, start + i);
1791 } 1793 }
1792 ext4_free_blocks(handle, inode, start, num); 1794 ext4_free_blocks(handle, inode, start, num, metadata);
1793 } else if (from == le32_to_cpu(ex->ee_block) 1795 } else if (from == le32_to_cpu(ex->ee_block)
1794 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 1796 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
1795 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", 1797 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2287,6 +2289,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2287 ext4_fsblk_t goal, newblock; 2289 ext4_fsblk_t goal, newblock;
2288 int err = 0, depth, ret; 2290 int err = 0, depth, ret;
2289 unsigned long allocated = 0; 2291 unsigned long allocated = 0;
2292 struct ext4_allocation_request ar;
2290 2293
2291 __clear_bit(BH_New, &bh_result->b_state); 2294 __clear_bit(BH_New, &bh_result->b_state);
2292 ext_debug("blocks %u/%lu requested for inode %u\n", 2295 ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2397,8 +2400,15 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2397 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info)) 2400 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
2398 ext4_init_block_alloc_info(inode); 2401 ext4_init_block_alloc_info(inode);
2399 2402
2400 /* allocate new block */ 2403 /* find neighbour allocated blocks */
2401 goal = ext4_ext_find_goal(inode, path, iblock); 2404 ar.lleft = iblock;
2405 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
2406 if (err)
2407 goto out2;
2408 ar.lright = iblock;
2409 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
2410 if (err)
2411 goto out2;
2402 2412
2403 /* 2413 /*
2404 * See if request is beyond maximum number of blocks we can have in 2414 * See if request is beyond maximum number of blocks we can have in
@@ -2421,7 +2431,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2421 allocated = le16_to_cpu(newex.ee_len); 2431 allocated = le16_to_cpu(newex.ee_len);
2422 else 2432 else
2423 allocated = max_blocks; 2433 allocated = max_blocks;
2424 newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err); 2434
2435 /* allocate new block */
2436 ar.inode = inode;
2437 ar.goal = ext4_ext_find_goal(inode, path, iblock);
2438 ar.logical = iblock;
2439 ar.len = allocated;
2440 if (S_ISREG(inode->i_mode))
2441 ar.flags = EXT4_MB_HINT_DATA;
2442 else
2443 /* disable in-core preallocation for non-regular files */
2444 ar.flags = 0;
2445 newblock = ext4_mb_new_blocks(handle, &ar, &err);
2425 if (!newblock) 2446 if (!newblock)
2426 goto out2; 2447 goto out2;
2427 ext_debug("allocate new block: goal %llu, found %llu/%lu\n", 2448 ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
@@ -2429,14 +2450,17 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2429 2450
2430 /* try to insert new extent into found leaf and return */ 2451 /* try to insert new extent into found leaf and return */
2431 ext4_ext_store_pblock(&newex, newblock); 2452 ext4_ext_store_pblock(&newex, newblock);
2432 newex.ee_len = cpu_to_le16(allocated); 2453 newex.ee_len = cpu_to_le16(ar.len);
2433 if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ 2454 if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */
2434 ext4_ext_mark_uninitialized(&newex); 2455 ext4_ext_mark_uninitialized(&newex);
2435 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2456 err = ext4_ext_insert_extent(handle, inode, path, &newex);
2436 if (err) { 2457 if (err) {
2437 /* free data blocks we just allocated */ 2458 /* free data blocks we just allocated */
2459 /* not a good idea to call discard here directly,
2460 * but otherwise we'd need to call it every free() */
2461 ext4_mb_discard_inode_preallocations(inode);
2438 ext4_free_blocks(handle, inode, ext_pblock(&newex), 2462 ext4_free_blocks(handle, inode, ext_pblock(&newex),
2439 le16_to_cpu(newex.ee_len)); 2463 le16_to_cpu(newex.ee_len), 0);
2440 goto out2; 2464 goto out2;
2441 } 2465 }
2442 2466
@@ -2445,6 +2469,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2445 2469
2446 /* previous routine could use block we allocated */ 2470 /* previous routine could use block we allocated */
2447 newblock = ext_pblock(&newex); 2471 newblock = ext_pblock(&newex);
2472 allocated = le16_to_cpu(newex.ee_len);
2448outnew: 2473outnew:
2449 __set_bit(BH_New, &bh_result->b_state); 2474 __set_bit(BH_New, &bh_result->b_state);
2450 2475
@@ -2496,6 +2521,8 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
2496 down_write(&EXT4_I(inode)->i_data_sem); 2521 down_write(&EXT4_I(inode)->i_data_sem);
2497 ext4_ext_invalidate_cache(inode); 2522 ext4_ext_invalidate_cache(inode);
2498 2523
2524 ext4_mb_discard_inode_preallocations(inode);
2525
2499 /* 2526 /*
2500 * TODO: optimization is possible here. 2527 * TODO: optimization is possible here.
2501 * Probably we need not scan at all, 2528 * Probably we need not scan at all,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a06a3b7cfc34..bb717cbb749c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -551,7 +551,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
551 return ret; 551 return ret;
552failed_out: 552failed_out:
553 for (i = 0; i <index; i++) 553 for (i = 0; i <index; i++)
554 ext4_free_blocks(handle, inode, new_blocks[i], 1); 554 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
555 return ret; 555 return ret;
556} 556}
557 557
@@ -650,9 +650,9 @@ failed:
650 ext4_journal_forget(handle, branch[i].bh); 650 ext4_journal_forget(handle, branch[i].bh);
651 } 651 }
652 for (i = 0; i <indirect_blks; i++) 652 for (i = 0; i <indirect_blks; i++)
653 ext4_free_blocks(handle, inode, new_blocks[i], 1); 653 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
654 654
655 ext4_free_blocks(handle, inode, new_blocks[i], num); 655 ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
656 656
657 return err; 657 return err;
658} 658}
@@ -749,9 +749,10 @@ err_out:
749 for (i = 1; i <= num; i++) { 749 for (i = 1; i <= num; i++) {
750 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); 750 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
751 ext4_journal_forget(handle, where[i].bh); 751 ext4_journal_forget(handle, where[i].bh);
752 ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); 752 ext4_free_blocks(handle, inode,
753 le32_to_cpu(where[i-1].key), 1, 0);
753 } 754 }
754 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); 755 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
755 756
756 return err; 757 return err;
757} 758}
@@ -2052,7 +2053,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
2052 } 2053 }
2053 } 2054 }
2054 2055
2055 ext4_free_blocks(handle, inode, block_to_free, count); 2056 ext4_free_blocks(handle, inode, block_to_free, count, 0);
2056} 2057}
2057 2058
2058/** 2059/**
@@ -2225,7 +2226,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
2225 ext4_journal_test_restart(handle, inode); 2226 ext4_journal_test_restart(handle, inode);
2226 } 2227 }
2227 2228
2228 ext4_free_blocks(handle, inode, nr, 1); 2229 ext4_free_blocks(handle, inode, nr, 1, 1);
2229 2230
2230 if (parent_bh) { 2231 if (parent_bh) {
2231 /* 2232 /*
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
new file mode 100644
index 000000000000..76e5fedc0a0b
--- /dev/null
+++ b/fs/ext4/mballoc.c
@@ -0,0 +1,4552 @@
1/*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19
20/*
21 * mballoc.c contains the multiblocks allocation routines
22 */
23
24#include <linux/time.h>
25#include <linux/fs.h>
26#include <linux/namei.h>
27#include <linux/ext4_jbd2.h>
28#include <linux/ext4_fs.h>
29#include <linux/quotaops.h>
30#include <linux/buffer_head.h>
31#include <linux/module.h>
32#include <linux/swap.h>
33#include <linux/proc_fs.h>
34#include <linux/pagemap.h>
35#include <linux/seq_file.h>
36#include <linux/version.h>
37#include "group.h"
38
39/*
40 * MUSTDO:
41 * - test ext4_ext_search_left() and ext4_ext_search_right()
42 * - search for metadata in few groups
43 *
44 * TODO v4:
45 * - normalization should take into account whether file is still open
46 * - discard preallocations if no free space left (policy?)
47 * - don't normalize tails
48 * - quota
49 * - reservation for superuser
50 *
51 * TODO v3:
52 * - bitmap read-ahead (proposed by Oleg Drokin aka green)
53 * - track min/max extents in each group for better group selection
54 * - mb_mark_used() may allocate chunk right after splitting buddy
55 * - tree of groups sorted by number of free blocks
56 * - error handling
57 */
58
59/*
60 * The allocation request involve request for multiple number of blocks
61 * near to the goal(block) value specified.
62 *
63 * During initialization phase of the allocator we decide to use the group
64 * preallocation or inode preallocation depending on the size of the file. The
65 * size of the file could be the resulting file size we would have after
66 * allocation or the current file size, whichever is larger. If the size is
67 * less than sbi->s_mb_stream_request we select the group
68 * preallocation. The default value of s_mb_stream_request is 16
69 * blocks. This can also be tuned via
70 * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms
71 * of number of blocks.
72 *
73 * The main motivation for having small file use group preallocation is to
74 * ensure that we have small file closer in the disk.
75 *
76 * First stage the allocator looks at the inode prealloc list
77 * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for
78 * this particular inode. The inode prealloc space is represented as:
79 *
80 * pa_lstart -> the logical start block for this prealloc space
81 * pa_pstart -> the physical start block for this prealloc space
82 * pa_len -> length for this prealloc space
83 * pa_free -> free space available in this prealloc space
84 *
85 * The inode preallocation space is used looking at the _logical_ start
86 * block. Only if the logical file block falls within the range of prealloc
87 * space will we consume the particular prealloc space. This makes sure that
88 * we have contiguous physical blocks representing the file blocks
89 *
90 * The important thing to be noted in case of inode prealloc space is that
91 * we don't modify the values associated to inode prealloc space except
92 * pa_free.
93 *
94 * If we are not able to find blocks in the inode prealloc space and if we
95 * have the group allocation flag set then we look at the locality group
96 * prealloc space. These are per CPU prealloc lists represented as
97 *
98 * ext4_sb_info.s_locality_groups[smp_processor_id()]
99 *
100 * The reason for having a per cpu locality group is to reduce the contention
101 * between CPUs. It is possible to get scheduled at this point.
102 *
103 * The locality group prealloc space is used looking at whether we have
104 * enough free space (pa_free) within the prealloc space.
105 *
106 * If we can't allocate blocks via inode prealloc or/and locality group
107 * prealloc then we look at the buddy cache. The buddy cache is represented
108 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
109 * mapped to the buddy and bitmap information regarding different
110 * groups. The buddy information is attached to buddy cache inode so that
111 * we can access them through the page cache. The information regarding
112 * each group is loaded via ext4_mb_load_buddy. The information involve
113 * block bitmap and buddy information. The information are stored in the
114 * inode as:
115 *
116 * { page }
117 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
118 *
119 *
120 * one block each for bitmap and buddy information. So for each group we
121 * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
122 * blocksize) blocks. So it can have information regarding groups_per_page
123 * which is blocks_per_page/2
124 *
125 * The buddy cache inode is not stored on disk. The inode is thrown
126 * away when the filesystem is unmounted.
127 *
128 * We look for count number of blocks in the buddy cache. If we were able
129 * to locate that many free blocks we return with additional information
130 * regarding rest of the contiguous physical block available
131 *
132 * Before allocating blocks via buddy cache we normalize the request
133 * blocks. This ensures we ask for more blocks than we needed. The extra
134 * blocks that we get after allocation are added to the respective prealloc
135 * list. In case of inode preallocation we follow a list of heuristics
136 * based on file size. This can be found in ext4_mb_normalize_request. If
137 * we are doing a group prealloc we try to normalize the request to
138 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to
139 * 512 blocks. This can be tuned via
140 * /proc/fs/ext4/<partition>/group_prealloc. The value is represented in
141 * terms of number of blocks. If we have mounted the file system with -o
142 * stripe=<value> option the group prealloc request is normalized to the
143 * stripe value (sbi->s_stripe)
144 *
145 * The regular allocator(using the buddy cache) support few tunables.
146 *
147 * /proc/fs/ext4/<partition>/min_to_scan
148 * /proc/fs/ext4/<partition>/max_to_scan
149 * /proc/fs/ext4/<partition>/order2_req
150 *
151 * The regular allocator use buddy scan only if the request len is power of
152 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
153 * value of s_mb_order2_reqs can be tuned via
154 * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to
155 * stripe size (sbi->s_stripe), we try to search for contiguous blocks in
156 * stripe size. This should result in better allocation on RAID setup. If
157 * not we search in the specific group using bitmap for best extents. The
158 * tunables min_to_scan and max_to_scan control the behaviour here.
159 * min_to_scan indicates how long the mballoc __must__ look for a best
160 * extent and max_to_scan indicates how long the mballoc __can__ look for a
161 * best extent in the found extents. Searching for the blocks starts with
162 * the group specified as the goal value in allocation context via
163 * ac_g_ex. Each group is first checked based on the criteria whether it
164 * can be used for allocation. ext4_mb_good_group explains how the groups are
165 * checked.
166 *
167 * Both the prealloc space are getting populated as above. So for the first
168 * request we will hit the buddy cache which will result in this prealloc
169 * space getting filled. The prealloc space is then later used for the
170 * subsequent request.
171 */
172
173/*
174 * mballoc operates on the following data:
175 * - on-disk bitmap
176 * - in-core buddy (actually includes buddy and bitmap)
177 * - preallocation descriptors (PAs)
178 *
179 * there are two types of preallocations:
180 * - inode
181 * assigned to specific inode and can be used for this inode only.
182 * it describes part of inode's space preallocated to specific
183 * physical blocks. any block from that preallocated can be used
184 * independent. the descriptor just tracks number of blocks left
185 * unused. so, before taking some block from descriptor, one must
186 * make sure corresponded logical block isn't allocated yet. this
187 * also means that freeing any block within descriptor's range
188 * must discard all preallocated blocks.
189 * - locality group
190 * assigned to specific locality group which does not translate to
191 * permanent set of inodes: inode can join and leave group. space
192 * from this type of preallocation can be used for any inode. thus
193 * it's consumed from the beginning to the end.
194 *
195 * relation between them can be expressed as:
196 * in-core buddy = on-disk bitmap + preallocation descriptors
197 *
198 * this mean blocks mballoc considers used are:
199 * - allocated blocks (persistent)
200 * - preallocated blocks (non-persistent)
201 *
202 * consistency in mballoc world means that at any time a block is either
203 * free or used in ALL structures. notice: "any time" should not be read
204 * literally -- time is discrete and delimited by locks.
205 *
206 * to keep it simple, we don't use block numbers, instead we count number of
207 * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
208 *
209 * all operations can be expressed as:
210 * - init buddy: buddy = on-disk + PAs
211 * - new PA: buddy += N; PA = N
212 * - use inode PA: on-disk += N; PA -= N
213 * - discard inode PA buddy -= on-disk - PA; PA = 0
214 * - use locality group PA on-disk += N; PA -= N
215 * - discard locality group PA buddy -= PA; PA = 0
216 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
217 * is used in real operation because we can't know actual used
218 * bits from PA, only from on-disk bitmap
219 *
220 * if we follow this strict logic, then all operations above should be atomic.
221 * given some of them can block, we'd have to use something like semaphores
222 * killing performance on high-end SMP hardware. let's try to relax it using
223 * the following knowledge:
224 * 1) if buddy is referenced, it's already initialized
225 * 2) while block is used in buddy and the buddy is referenced,
226 * nobody can re-allocate that block
227 * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
228 * bit set and PA claims same block, it's OK. IOW, one can set bit in
229 * on-disk bitmap if buddy has same bit set or/and PA covers corresponded
230 * block
231 *
232 * so, now we're building a concurrency table:
233 * - init buddy vs.
234 * - new PA
235 * blocks for PA are allocated in the buddy, buddy must be referenced
236 * until PA is linked to allocation group to avoid concurrent buddy init
237 * - use inode PA
238 * we need to make sure that either on-disk bitmap or PA has uptodate data
239 * given (3) we care that PA-=N operation doesn't interfere with init
240 * - discard inode PA
241 * the simplest way would be to have buddy initialized by the discard
242 * - use locality group PA
243 * again PA-=N must be serialized with init
244 * - discard locality group PA
245 * the simplest way would be to have buddy initialized by the discard
246 * - new PA vs.
247 * - use inode PA
248 * i_data_sem serializes them
249 * - discard inode PA
250 * discard process must wait until PA isn't used by another process
251 * - use locality group PA
252 * some mutex should serialize them
253 * - discard locality group PA
254 * discard process must wait until PA isn't used by another process
255 * - use inode PA
256 * - use inode PA
257 * i_data_sem or another mutex should serializes them
258 * - discard inode PA
259 * discard process must wait until PA isn't used by another process
260 * - use locality group PA
261 * nothing wrong here -- they're different PAs covering different blocks
262 * - discard locality group PA
263 * discard process must wait until PA isn't used by another process
264 *
265 * now we're ready to make few consequences:
266 * - PA is referenced and while it is no discard is possible
267 * - PA is referenced until block isn't marked in on-disk bitmap
268 * - PA changes only after on-disk bitmap
269 * - discard must not compete with init. either init is done before
270 * any discard or they're serialized somehow
271 * - buddy init as sum of on-disk bitmap and PAs is done atomically
272 *
273 * a special case when we've used PA to emptiness. no need to modify buddy
274 * in this case, but we should care about concurrent init
275 *
276 */
277
278 /*
279 * Logic in few words:
280 *
281 * - allocation:
282 * load group
283 * find blocks
284 * mark bits in on-disk bitmap
285 * release group
286 *
287 * - use preallocation:
288 * find proper PA (per-inode or group)
289 * load group
290 * mark bits in on-disk bitmap
291 * release group
292 * release PA
293 *
294 * - free:
295 * load group
296 * mark bits in on-disk bitmap
297 * release group
298 *
299 * - discard preallocations in group:
300 * mark PAs deleted
301 * move them onto local list
302 * load on-disk bitmap
303 * load group
304 * remove PA from object (inode or locality group)
305 * mark free blocks in-core
306 *
307 * - discard inode's preallocations:
308 */
309
310/*
311 * Locking rules
312 *
313 * Locks:
314 * - bitlock on a group (group)
315 * - object (inode/locality) (object)
316 * - per-pa lock (pa)
317 *
318 * Paths:
319 * - new pa
320 * object
321 * group
322 *
323 * - find and use pa:
324 * pa
325 *
326 * - release consumed pa:
327 * pa
328 * group
329 * object
330 *
331 * - generate in-core bitmap:
332 * group
333 * pa
334 *
335 * - discard all for given object (inode, locality group):
336 * object
337 * pa
338 * group
339 *
340 * - discard all for given group:
341 * group
342 * pa
343 * group
344 * object
345 *
346 */
347
/*
 * with AGGRESSIVE_CHECK allocator runs consistency checks over
 * structures. these checks slow things down a lot
 *
 * Note that the macro defined here carries a trailing "__": unless
 * AGGRESSIVE_CHECK itself is defined somewhere else, code guarded by
 * "#ifdef AGGRESSIVE_CHECK" is compiled out.
 */
#define AGGRESSIVE_CHECK__

/*
 * with DOUBLE_CHECK defined mballoc creates persistent in-core
 * bitmaps, maintains and uses them to check for double allocations
 *
 * As above, only DOUBLE_CHECK__ is defined here, so DOUBLE_CHECK stays
 * undefined unless it is provided from elsewhere.
 */
#define DOUBLE_CHECK__

/*
 * mb_debug() forwards its arguments to printk() when MB_DEBUG is defined
 * and expands to nothing otherwise. Only MB_DEBUG__ is defined here, so
 * the empty variant is selected unless MB_DEBUG comes from elsewhere.
 */
#define MB_DEBUG__
#ifdef MB_DEBUG
#define mb_debug(fmt, a...)	printk(fmt, ##a)
#else
#define mb_debug(fmt, a...)
#endif
368
/*
 * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
 * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
 *
 * The EXT4_MB_HISTORY_* values are distinct bits, one per kind of
 * recorded event; EXT4_MB_HISTORY_DEFAULT ORs together the allocation
 * and preallocation-use bits.
 */
#define EXT4_MB_HISTORY
#define EXT4_MB_HISTORY_ALLOC		1	/* allocation */
#define EXT4_MB_HISTORY_PREALLOC	2	/* preallocated blocks used */
#define EXT4_MB_HISTORY_DISCARD		4	/* preallocation discarded */
#define EXT4_MB_HISTORY_FREE		8	/* free */

#define EXT4_MB_HISTORY_DEFAULT		(EXT4_MB_HISTORY_ALLOC | \
					 EXT4_MB_HISTORY_PREALLOC)

/*
 * How long mballoc can look for a best extent (in found extents)
 */
#define MB_DEFAULT_MAX_TO_SCAN		200

/*
 * How long mballoc must look for a best extent
 */
#define MB_DEFAULT_MIN_TO_SCAN		10

/*
 * How many groups mballoc will scan looking for the best chunk
 */
#define MB_DEFAULT_MAX_GROUPS_TO_SCAN	5

/*
 * with 'ext4_mb_stats' allocator will collect stats that will be
 * shown at umount. The collecting costs though!
 */
#define MB_DEFAULT_STATS		1

/*
 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
 * by the stream allocator, whose purpose is to pack requests
 * as close to each other as possible to produce smooth I/O traffic.
 * We use locality group prealloc space for stream requests.
 * We can tune the same via /proc/fs/ext4/<partition>/stream_req
 */
#define MB_DEFAULT_STREAM_THRESHOLD	16	/* 64K (presumably assumes 4K blocks) */

/*
 * for which requests use 2^N search using buddies
 */
#define MB_DEFAULT_ORDER2_REQS		2

/*
 * default group prealloc size 512 blocks
 */
#define MB_DEFAULT_GROUP_PREALLOC	512
421
/*
 * NOTE(review): presumably the slab cache backing struct
 * ext4_prealloc_space objects - confirm at the allocation site.
 */
static struct kmem_cache *ext4_pspace_cachep;

/* force a local value, overriding any definition seen earlier */
#ifdef EXT4_BB_MAX_BLOCKS
#undef EXT4_BB_MAX_BLOCKS
#endif
#define EXT4_BB_MAX_BLOCKS	30

/*
 * A batch of up to EXT4_BB_MAX_BLOCKS block offsets belonging to one
 * group, chained through 'list'.
 * NOTE(review): looks like freed metadata blocks are parked here until
 * the transaction commits - the users are outside this part of the file.
 */
struct ext4_free_metadata {
	ext4_group_t group;		/* group the blocks belong to */
	unsigned short num;		/* entries used in blocks[] */
	ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
	struct list_head list;
};
435
/*
 * In-core allocator state kept for every block group.
 */
struct ext4_group_info {
	/* EXT4_GROUP_INFO_*_BIT flags; the LOCKED bit is used as a bit spinlock */
	unsigned long	bb_state;
	/* NOTE(review): presumably a transaction id - not used in this part of the file */
	unsigned long	bb_tid;
	/* NOTE(review): presumably the ext4_free_metadata batch being filled - confirm */
	struct ext4_free_metadata *bb_md_cur;
	/* first free bit seen by buddy generation; lowered when an earlier block is freed */
	unsigned short	bb_first_free;
	/* number of free blocks in the group */
	unsigned short	bb_free;
	/* number of separate free extents in the group */
	unsigned short	bb_fragments;
	/* preallocations located in this group */
	struct list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
	/* shadow bitmap used by the DOUBLE_CHECK helpers */
	void		*bb_bitmap;
#endif
	/* bb_counters[n]: number of free chunks of order n (2^n blocks) */
	unsigned short	bb_counters[];
};

/* bit numbers within ext4_group_info.bb_state */
#define EXT4_GROUP_INFO_NEED_INIT_BIT	0
#define EXT4_GROUP_INFO_LOCKED_BIT	1

/* non-zero while the NEED_INIT bit is still set for the group */
#define EXT4_MB_GRP_NEED_INIT(grp)	\
	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
455
456
/*
 * One preallocated chunk of blocks (a "PA" in the comments of this file).
 */
struct ext4_prealloc_space {
	/* NOTE(review): presumably links the PA to its owning inode or locality group */
	struct list_head	pa_inode_list;
	/* NOTE(review): presumably links the PA into ext4_group_info.bb_prealloc_list */
	struct list_head	pa_group_list;
	union {
		struct list_head pa_tmp_list;
		struct rcu_head	pa_rcu;
	} u;
	spinlock_t		pa_lock;	/* the "per-pa lock" of the locking rules */
	atomic_t		pa_count;
	unsigned		pa_deleted;
	ext4_fsblk_t		pa_pstart;	/* phys. block */
	ext4_lblk_t		pa_lstart;	/* log. block */
	unsigned short		pa_len;		/* len of preallocated chunk */
	unsigned short		pa_free;	/* how many blocks are free */
	unsigned short		pa_linear;	/* consumed in one direction
						 * strictly, for grp prealloc */
	spinlock_t		*pa_obj_lock;
	struct inode		*pa_inode;	/* hack, for history only */
};
476
477
/*
 * A run of blocks inside one group, as used for requests and results
 * of the allocator.
 */
struct ext4_free_extent {
	ext4_lblk_t fe_logical;		/* logical block (file offset) */
	ext4_grpblk_t fe_start;		/* first block, relative to the group */
	ext4_group_t fe_group;		/* group number */
	int fe_len;			/* length in blocks */
};
484
/*
 * Locality group:
 *   we try to group all related changes together
 *   so that writeback can flush/allocate them together as well
 */
struct ext4_locality_group {
	/* for allocator */
	struct mutex		lg_mutex;	/* to serialize allocates */
	struct list_head	lg_prealloc_list;/* list of preallocations */
	/* NOTE(review): presumably protects lg_prealloc_list - confirm at the users */
	spinlock_t		lg_prealloc_lock;
};
496
/*
 * State of a single allocation request as it travels through the
 * allocator.
 */
struct ext4_allocation_context {
	struct inode *ac_inode;
	struct super_block *ac_sb;

	/* original request */
	struct ext4_free_extent ac_o_ex;

	/* goal request (after normalization) */
	struct ext4_free_extent ac_g_ex;

	/* the best found extent */
	struct ext4_free_extent ac_b_ex;

	/* copy of the best found extent taken before preallocation efforts */
	struct ext4_free_extent ac_f_ex;

	/* number of iterations done. we have to track to limit searching */
	unsigned long ac_ex_scanned;
	__u16 ac_groups_scanned;
	__u16 ac_found;
	__u16 ac_tail;
	__u16 ac_buddy;
	__u16 ac_flags;		/* allocation hints */
	__u8 ac_status;		/* NOTE(review): presumably one of AC_STATUS_* below */
	__u8 ac_criteria;
	__u8 ac_repeats;
	__u8 ac_2order;		/* if request is to allocate 2^N blocks and
				 * N > 0, the field stores N, otherwise 0 */
	__u8 ac_op;		/* operation, for history only */
	struct page *ac_bitmap_page;
	struct page *ac_buddy_page;
	struct ext4_prealloc_space *ac_pa;
	struct ext4_locality_group *ac_lg;
};

#define AC_STATUS_CONTINUE	1
#define AC_STATUS_FOUND		2
#define AC_STATUS_BREAK		3
535
/*
 * One record of the allocation history (see EXT4_MB_HISTORY).
 */
struct ext4_mb_history {
	struct ext4_free_extent orig;	/* orig allocation */
	struct ext4_free_extent goal;	/* goal allocation */
	struct ext4_free_extent result;	/* result allocation */
	unsigned pid;
	unsigned ino;
	__u16 found;	/* how many extents have been found */
	__u16 groups;	/* how many groups have been scanned */
	__u16 tail;	/* what tail broke some buddy */
	__u16 buddy;	/* buddy the tail ^^^ broke */
	__u16 flags;
	/* the three bit-fields below share a single byte (3 + 4 + 1 bits) */
	__u8 cr:3;	/* which phase the result extent was found at */
	__u8 op:4;	/* wide enough for the EXT4_MB_HISTORY_* values (max 8) */
	__u8 merged:1;
};
551
/*
 * Handle on the cached bitmap and buddy data of one group, filled in by
 * ext4_mb_load_buddy() and dropped with ext4_mb_release_desc().
 */
struct ext4_buddy {
	struct page *bd_buddy_page;	/* referenced page holding the buddy block */
	void *bd_buddy;			/* address of the buddy block in that page */
	struct page *bd_bitmap_page;	/* referenced page holding the bitmap block */
	void *bd_bitmap;		/* address of the bitmap block in that page */
	struct ext4_group_info *bd_info;
	struct super_block *bd_sb;
	__u16 bd_blkbits;		/* copy of sb->s_blocksize_bits */
	ext4_group_t bd_group;
};
/* accessors for the two data areas of a loaded struct ext4_buddy */
#define EXT4_MB_BITMAP(e4b)	((e4b)->bd_bitmap)
#define EXT4_MB_BUDDY(e4b)	((e4b)->bd_buddy)
564
/*
 * With EXT4_MB_HISTORY the real recorder is only declared here;
 * without it, recording an allocation is an empty inline no-op.
 */
#ifdef EXT4_MB_HISTORY
static void ext4_mb_store_history(struct ext4_allocation_context *ac);
#else
static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
{
}
#endif
573
/* true when b lies in the closed interval [first, first + len - 1] */
#define in_range(b, first, len)	\
	(((b) >= (first)) && ((b) <= ((first) + (len) - 1)))
575
/* NOTE(review): presumably the common /proc directory for ext4 - not used in this part of the file */
static struct proc_dir_entry *proc_root_ext4;

/* non-static helpers that are only declared here */
struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
				ext4_fsblk_t goal, unsigned long *count, int *errp);

/* forward declarations of static helpers of this file */
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
					ext4_group_t group);
static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
static void ext4_mb_free_committed_blocks(struct super_block *);
static void ext4_mb_return_to_preallocation(struct inode *inode,
					struct ext4_buddy *e4b, sector_t block,
					int count);
static void ext4_mb_put_pa(struct ext4_allocation_context *,
			struct super_block *, struct ext4_prealloc_space *pa);
static int ext4_mb_init_per_dev_proc(struct super_block *sb);
static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
592
593
594static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
595{
596 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
597
598 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
599}
600
601static inline void ext4_unlock_group(struct super_block *sb,
602 ext4_group_t group)
603{
604 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
605
606 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
607}
608
609static inline int ext4_is_group_locked(struct super_block *sb,
610 ext4_group_t group)
611{
612 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
613
614 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
615 &(grinfo->bb_state));
616}
617
618static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
619 struct ext4_free_extent *fex)
620{
621 ext4_fsblk_t block;
622
623 block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
624 + fex->fe_start
625 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
626 return block;
627}
628
629#if BITS_PER_LONG == 64
630#define mb_correct_addr_and_bit(bit, addr) \
631{ \
632 bit += ((unsigned long) addr & 7UL) << 3; \
633 addr = (void *) ((unsigned long) addr & ~7UL); \
634}
635#elif BITS_PER_LONG == 32
636#define mb_correct_addr_and_bit(bit, addr) \
637{ \
638 bit += ((unsigned long) addr & 3UL) << 3; \
639 addr = (void *) ((unsigned long) addr & ~3UL); \
640}
641#else
642#error "how many bits you are?!"
643#endif
644
/*
 * Test bit @bit of the bitmap at @addr.
 *
 * ext4_test_bit on architectures like powerpc needs an unsigned long
 * aligned address, so the address/bit pair is normalized first.
 */
static inline int mb_test_bit(int bit, void *addr)
{
	int nr = bit;
	void *base = addr;

	mb_correct_addr_and_bit(nr, base);
	return ext4_test_bit(nr, base);
}
654
/* Set bit @bit of the bitmap at @addr after aligning the address. */
static inline void mb_set_bit(int bit, void *addr)
{
	int nr = bit;
	void *base = addr;

	mb_correct_addr_and_bit(nr, base);
	ext4_set_bit(nr, base);
}
660
661static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
662{
663 mb_correct_addr_and_bit(bit, addr);
664 ext4_set_bit_atomic(lock, bit, addr);
665}
666
/* Clear bit @bit of the bitmap at @addr after aligning the address. */
static inline void mb_clear_bit(int bit, void *addr)
{
	int nr = bit;
	void *base = addr;

	mb_correct_addr_and_bit(nr, base);
	ext4_clear_bit(nr, base);
}
672
673static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
674{
675 mb_correct_addr_and_bit(bit, addr);
676 ext4_clear_bit_atomic(lock, bit, addr);
677}
678
679static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
680{
681 char *bb;
682
683 /* FIXME!! is this needed */
684 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
685 BUG_ON(max == NULL);
686
687 if (order > e4b->bd_blkbits + 1) {
688 *max = 0;
689 return NULL;
690 }
691
692 /* at order 0 we see each particular block */
693 *max = 1 << (e4b->bd_blkbits + 3);
694 if (order == 0)
695 return EXT4_MB_BITMAP(e4b);
696
697 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
698 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
699
700 return bb;
701}
702
703#ifdef DOUBLE_CHECK
704static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
705 int first, int count)
706{
707 int i;
708 struct super_block *sb = e4b->bd_sb;
709
710 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
711 return;
712 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
713 for (i = 0; i < count; i++) {
714 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
715 ext4_fsblk_t blocknr;
716 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
717 blocknr += first + i;
718 blocknr +=
719 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
720
721 ext4_error(sb, __FUNCTION__, "double-free of inode"
722 " %lu's block %llu(bit %u in group %lu)\n",
723 inode ? inode->i_ino : 0, blocknr,
724 first + i, e4b->bd_group);
725 }
726 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
727 }
728}
729
730static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
731{
732 int i;
733
734 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
735 return;
736 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
737 for (i = 0; i < count; i++) {
738 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
739 mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
740 }
741}
742
743static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
744{
745 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
746 unsigned char *b1, *b2;
747 int i;
748 b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
749 b2 = (unsigned char *) bitmap;
750 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
751 if (b1[i] != b2[i]) {
752 printk("corruption in group %lu at byte %u(%u):"
753 " %x in copy != %x on disk/prealloc\n",
754 e4b->bd_group, i, i * 8, b1[i], b2[i]);
755 BUG();
756 }
757 }
758 }
759}
760
761#else
/* Without DOUBLE_CHECK the shadow-bitmap helpers are empty no-ops. */
static inline void mb_free_blocks_double(struct inode *inode,
				struct ext4_buddy *e4b, int first, int count)
{
}

static inline void mb_mark_used_double(struct ext4_buddy *e4b,
						int first, int count)
{
}

static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
}
776#endif
777
#ifdef AGGRESSIVE_CHECK

/*
 * Assertion used only inside __mb_check_buddy(): it reports the caller's
 * position through that function's 'function', 'file' and 'line'
 * parameters and then BUG()s.
 */
#define MB_CHECK_ASSERT(assert)						\
do {									\
	if (!(assert)) {						\
		printk(KERN_EMERG					\
			"Assertion failure in %s() at %s:%d: \"%s\"\n",	\
			function, file, line, # assert);		\
		BUG();							\
	}								\
} while (0)

/*
 * Cross-check the buddy data of a loaded group against its bitmap,
 * counters and preallocation list. Any inconsistency ends in BUG().
 *
 * The check is skipped when the MBALLOC mount option is off, and only
 * one call in a hundred actually performs it. Always returns 0.
 */
static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
				const char *function, int line)
{
	struct super_block *sb = e4b->bd_sb;
	int order = e4b->bd_blkbits + 1;
	int max;
	int max2;
	int i;
	int j;
	int k;
	int count;
	struct ext4_group_info *grp;
	int fragments = 0;
	int fstart;
	struct list_head *cur;
	void *buddy;
	void *buddy2;

	if (!test_opt(sb, MBALLOC))
		return 0;

	{
		static int mb_check_counter;
		if (mb_check_counter++ % 100 != 0)
			return 0;
	}

	/* compare every order with the order right below it */
	while (order > 1) {
		buddy = mb_find_buddy(e4b, order, &max);
		MB_CHECK_ASSERT(buddy);
		buddy2 = mb_find_buddy(e4b, order - 1, &max2);
		MB_CHECK_ASSERT(buddy2);
		MB_CHECK_ASSERT(buddy != buddy2);
		MB_CHECK_ASSERT(max * 2 == max2);

		count = 0;
		for (i = 0; i < max; i++) {

			if (mb_test_bit(i, buddy)) {
				/* only single bit in buddy2 may be 1 */
				if (!mb_test_bit(i << 1, buddy2)) {
					MB_CHECK_ASSERT(
						mb_test_bit((i<<1)+1, buddy2));
				} else if (!mb_test_bit((i << 1) + 1, buddy2)) {
					MB_CHECK_ASSERT(
						mb_test_bit(i << 1, buddy2));
				}
				continue;
			}

			/* both bits in buddy2 must be 1 */
			MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
			MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));

			/* every block of a free chunk is free in the bitmap */
			for (j = 0; j < (1 << order); j++) {
				k = (i * (1 << order)) + j;
				MB_CHECK_ASSERT(
					!mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
			}
			count++;
		}
		MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
		order--;
	}

	/* walk the block bitmap: count free extents, check used blocks */
	fstart = -1;
	buddy = mb_find_buddy(e4b, 0, &max);
	for (i = 0; i < max; i++) {
		if (!mb_test_bit(i, buddy)) {
			MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
			if (fstart == -1) {
				fragments++;
				fstart = i;
			}
			continue;
		}
		fstart = -1;
		/* check used bits only */
		for (j = 0; j < e4b->bd_blkbits + 1; j++) {
			buddy2 = mb_find_buddy(e4b, j, &max2);
			k = i >> j;
			MB_CHECK_ASSERT(k < max2);
			MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
		}
	}
	MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
	MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);

	/* blocks covered by a preallocation must be marked used */
	grp = ext4_get_group_info(sb, e4b->bd_group);
	buddy = mb_find_buddy(e4b, 0, &max);
	list_for_each(cur, &grp->bb_prealloc_list) {
		ext4_group_t groupnr;
		struct ext4_prealloc_space *pa;
		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
		MB_CHECK_ASSERT(groupnr == e4b->bd_group);
		for (i = 0; i < pa->pa_len; i++)
			MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
	}
	return 0;
}
#undef MB_CHECK_ASSERT
#define mb_check_buddy(e4b) __mb_check_buddy(e4b,	\
					__FILE__, __FUNCTION__, __LINE__)
#else
#define mb_check_buddy(e4b)
#endif
897
898/* FIXME!! need more doc */
899static void ext4_mb_mark_free_simple(struct super_block *sb,
900 void *buddy, unsigned first, int len,
901 struct ext4_group_info *grp)
902{
903 struct ext4_sb_info *sbi = EXT4_SB(sb);
904 unsigned short min;
905 unsigned short max;
906 unsigned short chunk;
907 unsigned short border;
908
909 BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb));
910
911 border = 2 << sb->s_blocksize_bits;
912
913 while (len > 0) {
914 /* find how many blocks can be covered since this position */
915 max = ffs(first | border) - 1;
916
917 /* find how many blocks of power 2 we need to mark */
918 min = fls(len) - 1;
919
920 if (max < min)
921 min = max;
922 chunk = 1 << min;
923
924 /* mark multiblock chunks only */
925 grp->bb_counters[min]++;
926 if (min > 0)
927 mb_clear_bit(first >> min,
928 buddy + sbi->s_mb_offsets[min]);
929
930 len -= chunk;
931 first += chunk;
932 }
933}
934
935static void ext4_mb_generate_buddy(struct super_block *sb,
936 void *buddy, void *bitmap, ext4_group_t group)
937{
938 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
939 unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
940 unsigned short i = 0;
941 unsigned short first;
942 unsigned short len;
943 unsigned free = 0;
944 unsigned fragments = 0;
945 unsigned long long period = get_cycles();
946
947 /* initialize buddy from bitmap which is aggregation
948 * of on-disk bitmap and preallocations */
949 i = ext4_find_next_zero_bit(bitmap, max, 0);
950 grp->bb_first_free = i;
951 while (i < max) {
952 fragments++;
953 first = i;
954 i = ext4_find_next_bit(bitmap, max, i);
955 len = i - first;
956 free += len;
957 if (len > 1)
958 ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
959 else
960 grp->bb_counters[0]++;
961 if (i < max)
962 i = ext4_find_next_zero_bit(bitmap, max, i);
963 }
964 grp->bb_fragments = fragments;
965
966 if (free != grp->bb_free) {
967 printk(KERN_DEBUG
968 "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
969 group, free, grp->bb_free);
970 grp->bb_free = free;
971 }
972
973 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
974
975 period = get_cycles() - period;
976 spin_lock(&EXT4_SB(sb)->s_bal_lock);
977 EXT4_SB(sb)->s_mb_buddies_generated++;
978 EXT4_SB(sb)->s_mb_generation_time += period;
979 spin_unlock(&EXT4_SB(sb)->s_bal_lock);
980}
981
982/* The buddy information is attached to the buddy cache inode
983 * for convenience. The information regarding each group
984 * is loaded via ext4_mb_load_buddy. The information involves
985 * block bitmap and buddy information. The information is
986 * stored in the inode as
987 *
988 * { page }
989 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
990 *
991 *
992 * one block each for bitmap and buddy information.
993 * So for each group we take up 2 blocks. A page can
994 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
995 * So it can have information regarding groups_per_page which
996 * is blocks_per_page/2
997 */
998
999static int ext4_mb_init_cache(struct page *page, char *incore)
1000{
1001 int blocksize;
1002 int blocks_per_page;
1003 int groups_per_page;
1004 int err = 0;
1005 int i;
1006 ext4_group_t first_group;
1007 int first_block;
1008 struct super_block *sb;
1009 struct buffer_head *bhs;
1010 struct buffer_head **bh;
1011 struct inode *inode;
1012 char *data;
1013 char *bitmap;
1014
1015 mb_debug("init page %lu\n", page->index);
1016
1017 inode = page->mapping->host;
1018 sb = inode->i_sb;
1019 blocksize = 1 << inode->i_blkbits;
1020 blocks_per_page = PAGE_CACHE_SIZE / blocksize;
1021
1022 groups_per_page = blocks_per_page >> 1;
1023 if (groups_per_page == 0)
1024 groups_per_page = 1;
1025
1026 /* allocate buffer_heads to read bitmaps */
1027 if (groups_per_page > 1) {
1028 err = -ENOMEM;
1029 i = sizeof(struct buffer_head *) * groups_per_page;
1030 bh = kzalloc(i, GFP_NOFS);
1031 if (bh == NULL)
1032 goto out;
1033 } else
1034 bh = &bhs;
1035
1036 first_group = page->index * blocks_per_page / 2;
1037
1038 /* read all groups the page covers into the cache */
1039 for (i = 0; i < groups_per_page; i++) {
1040 struct ext4_group_desc *desc;
1041
1042 if (first_group + i >= EXT4_SB(sb)->s_groups_count)
1043 break;
1044
1045 err = -EIO;
1046 desc = ext4_get_group_desc(sb, first_group + i, NULL);
1047 if (desc == NULL)
1048 goto out;
1049
1050 err = -ENOMEM;
1051 bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
1052 if (bh[i] == NULL)
1053 goto out;
1054
1055 if (bh_uptodate_or_lock(bh[i]))
1056 continue;
1057
1058 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
1059 ext4_init_block_bitmap(sb, bh[i],
1060 first_group + i, desc);
1061 set_buffer_uptodate(bh[i]);
1062 unlock_buffer(bh[i]);
1063 continue;
1064 }
1065 get_bh(bh[i]);
1066 bh[i]->b_end_io = end_buffer_read_sync;
1067 submit_bh(READ, bh[i]);
1068 mb_debug("read bitmap for group %lu\n", first_group + i);
1069 }
1070
1071 /* wait for I/O completion */
1072 for (i = 0; i < groups_per_page && bh[i]; i++)
1073 wait_on_buffer(bh[i]);
1074
1075 err = -EIO;
1076 for (i = 0; i < groups_per_page && bh[i]; i++)
1077 if (!buffer_uptodate(bh[i]))
1078 goto out;
1079
1080 first_block = page->index * blocks_per_page;
1081 for (i = 0; i < blocks_per_page; i++) {
1082 int group;
1083 struct ext4_group_info *grinfo;
1084
1085 group = (first_block + i) >> 1;
1086 if (group >= EXT4_SB(sb)->s_groups_count)
1087 break;
1088
1089 /*
1090 * data carry information regarding this
1091 * particular group in the format specified
1092 * above
1093 *
1094 */
1095 data = page_address(page) + (i * blocksize);
1096 bitmap = bh[group - first_group]->b_data;
1097
1098 /*
1099 * We place the buddy block and bitmap block
1100 * close together
1101 */
1102 if ((first_block + i) & 1) {
1103 /* this is block of buddy */
1104 BUG_ON(incore == NULL);
1105 mb_debug("put buddy for group %u in page %lu/%x\n",
1106 group, page->index, i * blocksize);
1107 memset(data, 0xff, blocksize);
1108 grinfo = ext4_get_group_info(sb, group);
1109 grinfo->bb_fragments = 0;
1110 memset(grinfo->bb_counters, 0,
1111 sizeof(unsigned short)*(sb->s_blocksize_bits+2));
1112 /*
1113 * incore got set to the group block bitmap below
1114 */
1115 ext4_mb_generate_buddy(sb, data, incore, group);
1116 incore = NULL;
1117 } else {
1118 /* this is block of bitmap */
1119 BUG_ON(incore != NULL);
1120 mb_debug("put bitmap for group %u in page %lu/%x\n",
1121 group, page->index, i * blocksize);
1122
1123 /* see comments in ext4_mb_put_pa() */
1124 ext4_lock_group(sb, group);
1125 memcpy(data, bitmap, blocksize);
1126
1127 /* mark all preallocated blks used in in-core bitmap */
1128 ext4_mb_generate_from_pa(sb, data, group);
1129 ext4_unlock_group(sb, group);
1130
1131 /* set incore so that the buddy information can be
1132 * generated using this
1133 */
1134 incore = data;
1135 }
1136 }
1137 SetPageUptodate(page);
1138
1139out:
1140 if (bh) {
1141 for (i = 0; i < groups_per_page && bh[i]; i++)
1142 brelse(bh[i]);
1143 if (bh != &bhs)
1144 kfree(bh);
1145 }
1146 return err;
1147}
1148
1149static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1150 struct ext4_buddy *e4b)
1151{
1152 struct ext4_sb_info *sbi = EXT4_SB(sb);
1153 struct inode *inode = sbi->s_buddy_cache;
1154 int blocks_per_page;
1155 int block;
1156 int pnum;
1157 int poff;
1158 struct page *page;
1159
1160 mb_debug("load group %lu\n", group);
1161
1162 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1163
1164 e4b->bd_blkbits = sb->s_blocksize_bits;
1165 e4b->bd_info = ext4_get_group_info(sb, group);
1166 e4b->bd_sb = sb;
1167 e4b->bd_group = group;
1168 e4b->bd_buddy_page = NULL;
1169 e4b->bd_bitmap_page = NULL;
1170
1171 /*
1172 * the buddy cache inode stores the block bitmap
1173 * and buddy information in consecutive blocks.
1174 * So for each group we need two blocks.
1175 */
1176 block = group * 2;
1177 pnum = block / blocks_per_page;
1178 poff = block % blocks_per_page;
1179
1180 /* we could use find_or_create_page(), but it locks page
1181 * what we'd like to avoid in fast path ... */
1182 page = find_get_page(inode->i_mapping, pnum);
1183 if (page == NULL || !PageUptodate(page)) {
1184 if (page)
1185 page_cache_release(page);
1186 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1187 if (page) {
1188 BUG_ON(page->mapping != inode->i_mapping);
1189 if (!PageUptodate(page)) {
1190 ext4_mb_init_cache(page, NULL);
1191 mb_cmp_bitmaps(e4b, page_address(page) +
1192 (poff * sb->s_blocksize));
1193 }
1194 unlock_page(page);
1195 }
1196 }
1197 if (page == NULL || !PageUptodate(page))
1198 goto err;
1199 e4b->bd_bitmap_page = page;
1200 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1201 mark_page_accessed(page);
1202
1203 block++;
1204 pnum = block / blocks_per_page;
1205 poff = block % blocks_per_page;
1206
1207 page = find_get_page(inode->i_mapping, pnum);
1208 if (page == NULL || !PageUptodate(page)) {
1209 if (page)
1210 page_cache_release(page);
1211 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1212 if (page) {
1213 BUG_ON(page->mapping != inode->i_mapping);
1214 if (!PageUptodate(page))
1215 ext4_mb_init_cache(page, e4b->bd_bitmap);
1216
1217 unlock_page(page);
1218 }
1219 }
1220 if (page == NULL || !PageUptodate(page))
1221 goto err;
1222 e4b->bd_buddy_page = page;
1223 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1224 mark_page_accessed(page);
1225
1226 BUG_ON(e4b->bd_bitmap_page == NULL);
1227 BUG_ON(e4b->bd_buddy_page == NULL);
1228
1229 return 0;
1230
1231err:
1232 if (e4b->bd_bitmap_page)
1233 page_cache_release(e4b->bd_bitmap_page);
1234 if (e4b->bd_buddy_page)
1235 page_cache_release(e4b->bd_buddy_page);
1236 e4b->bd_buddy = NULL;
1237 e4b->bd_bitmap = NULL;
1238 return -EIO;
1239}
1240
1241static void ext4_mb_release_desc(struct ext4_buddy *e4b)
1242{
1243 if (e4b->bd_bitmap_page)
1244 page_cache_release(e4b->bd_bitmap_page);
1245 if (e4b->bd_buddy_page)
1246 page_cache_release(e4b->bd_buddy_page);
1247}
1248
1249
1250static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1251{
1252 int order = 1;
1253 void *bb;
1254
1255 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
1256 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1257
1258 bb = EXT4_MB_BUDDY(e4b);
1259 while (order <= e4b->bd_blkbits + 1) {
1260 block = block >> 1;
1261 if (!mb_test_bit(block, bb)) {
1262 /* this block is part of buddy of order 'order' */
1263 return order;
1264 }
1265 bb += 1 << (e4b->bd_blkbits - order);
1266 order++;
1267 }
1268 return 0;
1269}
1270
1271static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
1272{
1273 __u32 *addr;
1274
1275 len = cur + len;
1276 while (cur < len) {
1277 if ((cur & 31) == 0 && (len - cur) >= 32) {
1278 /* fast path: clear whole word at once */
1279 addr = bm + (cur >> 3);
1280 *addr = 0;
1281 cur += 32;
1282 continue;
1283 }
1284 mb_clear_bit_atomic(lock, cur, bm);
1285 cur++;
1286 }
1287}
1288
1289static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1290{
1291 __u32 *addr;
1292
1293 len = cur + len;
1294 while (cur < len) {
1295 if ((cur & 31) == 0 && (len - cur) >= 32) {
1296 /* fast path: set whole word at once */
1297 addr = bm + (cur >> 3);
1298 *addr = 0xffffffff;
1299 cur += 32;
1300 continue;
1301 }
1302 mb_set_bit_atomic(lock, cur, bm);
1303 cur++;
1304 }
1305}
1306
1307static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1308 int first, int count)
1309{
1310 int block = 0;
1311 int max = 0;
1312 int order;
1313 void *buddy;
1314 void *buddy2;
1315 struct super_block *sb = e4b->bd_sb;
1316
1317 BUG_ON(first + count > (sb->s_blocksize << 3));
1318 BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
1319 mb_check_buddy(e4b);
1320 mb_free_blocks_double(inode, e4b, first, count);
1321
1322 e4b->bd_info->bb_free += count;
1323 if (first < e4b->bd_info->bb_first_free)
1324 e4b->bd_info->bb_first_free = first;
1325
1326 /* let's maintain fragments counter */
1327 if (first != 0)
1328 block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
1329 if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
1330 max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
1331 if (block && max)
1332 e4b->bd_info->bb_fragments--;
1333 else if (!block && !max)
1334 e4b->bd_info->bb_fragments++;
1335
1336 /* let's maintain buddy itself */
1337 while (count-- > 0) {
1338 block = first++;
1339 order = 0;
1340
1341 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
1342 ext4_fsblk_t blocknr;
1343 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
1344 blocknr += block;
1345 blocknr +=
1346 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1347
1348 ext4_error(sb, __FUNCTION__, "double-free of inode"
1349 " %lu's block %llu(bit %u in group %lu)\n",
1350 inode ? inode->i_ino : 0, blocknr, block,
1351 e4b->bd_group);
1352 }
1353 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1354 e4b->bd_info->bb_counters[order]++;
1355
1356 /* start of the buddy */
1357 buddy = mb_find_buddy(e4b, order, &max);
1358
1359 do {
1360 block &= ~1UL;
1361 if (mb_test_bit(block, buddy) ||
1362 mb_test_bit(block + 1, buddy))
1363 break;
1364
1365 /* both the buddies are free, try to coalesce them */
1366 buddy2 = mb_find_buddy(e4b, order + 1, &max);
1367
1368 if (!buddy2)
1369 break;
1370
1371 if (order > 0) {
1372 /* for special purposes, we don't set
1373 * free bits in bitmap */
1374 mb_set_bit(block, buddy);
1375 mb_set_bit(block + 1, buddy);
1376 }
1377 e4b->bd_info->bb_counters[order]--;
1378 e4b->bd_info->bb_counters[order]--;
1379
1380 block = block >> 1;
1381 order++;
1382 e4b->bd_info->bb_counters[order]++;
1383
1384 mb_clear_bit(block, buddy2);
1385 buddy = buddy2;
1386 } while (1);
1387 }
1388 mb_check_buddy(e4b);
1389
1390 return 0;
1391}
1392
/*
 * Describe in @ex the free extent that starts at @block (a bit number in
 * the bitmap of the given @order) of the loaded group.
 *
 * When the bit is set, i.e. the block is in use, @ex is zeroed and 0 is
 * returned. Otherwise the extent starts exactly at the requested block
 * and is extended over the free chunks that follow until it reaches
 * @needed blocks or a used block. Returns the length found, which is
 * also stored in ex->fe_len. The group lock must be held.
 */
static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
				int needed, struct ext4_free_extent *ex)
{
	int next = block;
	int max;
	int ord;
	void *buddy;

	BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
	BUG_ON(ex == NULL);

	buddy = mb_find_buddy(e4b, order, &max);
	BUG_ON(buddy == NULL);
	BUG_ON(block >= max);
	if (mb_test_bit(block, buddy)) {
		ex->fe_len = 0;
		ex->fe_start = 0;
		ex->fe_group = 0;
		return 0;
	}

	/* FIXME drop order completely ? */
	if (likely(order == 0)) {
		/* find actual order */
		order = mb_find_order_for_block(e4b, block);
		block = block >> order;
	}

	/* start with the whole chunk that contains the block */
	ex->fe_len = 1 << order;
	ex->fe_start = block << order;
	ex->fe_group = e4b->bd_group;

	/* calc difference from given start */
	next = next - ex->fe_start;
	ex->fe_len -= next;
	ex->fe_start += next;

	/* append the following chunks while they are free and needed */
	while (needed > ex->fe_len &&
	       (buddy = mb_find_buddy(e4b, order, &max))) {

		if (block + 1 >= max)
			break;

		/* first block of the chunk right after the current one */
		next = (block + 1) * (1 << order);
		if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
			break;

		ord = mb_find_order_for_block(e4b, next);

		order = ord;
		block = next >> order;
		ex->fe_len += 1 << order;
	}

	BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
	return ex->fe_len;
}
1450
1451static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1452{
1453 int ord;
1454 int mlen = 0;
1455 int max = 0;
1456 int cur;
1457 int start = ex->fe_start;
1458 int len = ex->fe_len;
1459 unsigned ret = 0;
1460 int len0 = len;
1461 void *buddy;
1462
1463 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1464 BUG_ON(e4b->bd_group != ex->fe_group);
1465 BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
1466 mb_check_buddy(e4b);
1467 mb_mark_used_double(e4b, start, len);
1468
1469 e4b->bd_info->bb_free -= len;
1470 if (e4b->bd_info->bb_first_free == start)
1471 e4b->bd_info->bb_first_free += len;
1472
1473 /* let's maintain fragments counter */
1474 if (start != 0)
1475 mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
1476 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1477 max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
1478 if (mlen && max)
1479 e4b->bd_info->bb_fragments++;
1480 else if (!mlen && !max)
1481 e4b->bd_info->bb_fragments--;
1482
1483 /* let's maintain buddy itself */
1484 while (len) {
1485 ord = mb_find_order_for_block(e4b, start);
1486
1487 if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1488 /* the whole chunk may be allocated at once! */
1489 mlen = 1 << ord;
1490 buddy = mb_find_buddy(e4b, ord, &max);
1491 BUG_ON((start >> ord) >= max);
1492 mb_set_bit(start >> ord, buddy);
1493 e4b->bd_info->bb_counters[ord]--;
1494 start += mlen;
1495 len -= mlen;
1496 BUG_ON(len < 0);
1497 continue;
1498 }
1499
1500 /* store for history */
1501 if (ret == 0)
1502 ret = len | (ord << 16);
1503
1504 /* we have to split large buddy */
1505 BUG_ON(ord <= 0);
1506 buddy = mb_find_buddy(e4b, ord, &max);
1507 mb_set_bit(start >> ord, buddy);
1508 e4b->bd_info->bb_counters[ord]--;
1509
1510 ord--;
1511 cur = (start >> ord) & ~1U;
1512 buddy = mb_find_buddy(e4b, ord, &max);
1513 mb_clear_bit(cur, buddy);
1514 mb_clear_bit(cur + 1, buddy);
1515 e4b->bd_info->bb_counters[ord]++;
1516 e4b->bd_info->bb_counters[ord]++;
1517 }
1518
1519 mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group),
1520 EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1521 mb_check_buddy(e4b);
1522
1523 return ret;
1524}
1525
1526/*
1527 * Must be called under group lock!
1528 */
1529static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1530 struct ext4_buddy *e4b)
1531{
1532 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1533 int ret;
1534
1535 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
1536 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1537
1538 ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
1539 ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
1540 ret = mb_mark_used(e4b, &ac->ac_b_ex);
1541
1542 /* preallocation can change ac_b_ex, thus we store actually
1543 * allocated blocks for history */
1544 ac->ac_f_ex = ac->ac_b_ex;
1545
1546 ac->ac_status = AC_STATUS_FOUND;
1547 ac->ac_tail = ret & 0xffff;
1548 ac->ac_buddy = ret >> 16;
1549
1550 /* XXXXXXX: SUCH A HORRIBLE **CK */
1551 /*FIXME!! Why ? */
1552 ac->ac_bitmap_page = e4b->bd_bitmap_page;
1553 get_page(ac->ac_bitmap_page);
1554 ac->ac_buddy_page = e4b->bd_buddy_page;
1555 get_page(ac->ac_buddy_page);
1556
1557 /* store last allocated for subsequent stream allocation */
1558 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
1559 spin_lock(&sbi->s_md_lock);
1560 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1561 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
1562 spin_unlock(&sbi->s_md_lock);
1563 }
1564}
1565
1566/*
1567 * regular allocator, for general purposes allocation
1568 */
1569
1570static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1571 struct ext4_buddy *e4b,
1572 int finish_group)
1573{
1574 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1575 struct ext4_free_extent *bex = &ac->ac_b_ex;
1576 struct ext4_free_extent *gex = &ac->ac_g_ex;
1577 struct ext4_free_extent ex;
1578 int max;
1579
1580 /*
1581 * We don't want to scan for a whole year
1582 */
1583 if (ac->ac_found > sbi->s_mb_max_to_scan &&
1584 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1585 ac->ac_status = AC_STATUS_BREAK;
1586 return;
1587 }
1588
1589 /*
1590 * Haven't found good chunk so far, let's continue
1591 */
1592 if (bex->fe_len < gex->fe_len)
1593 return;
1594
1595 if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
1596 && bex->fe_group == e4b->bd_group) {
1597 /* recheck chunk's availability - we don't know
1598 * when it was found (within this lock-unlock
1599 * period or not) */
1600 max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
1601 if (max >= gex->fe_len) {
1602 ext4_mb_use_best_found(ac, e4b);
1603 return;
1604 }
1605 }
1606}
1607
1608/*
1609 * The routine checks whether found extent is good enough. If it is,
1610 * then the extent gets marked used and flag is set to the context
1611 * to stop scanning. Otherwise, the extent is compared with the
1612 * previous found extent and if new one is better, then it's stored
1613 * in the context. Later, the best found extent will be used, if
1614 * mballoc can't find good enough extent.
1615 *
1616 * FIXME: real allocation policy is to be designed yet!
1617 */
1618static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1619 struct ext4_free_extent *ex,
1620 struct ext4_buddy *e4b)
1621{
1622 struct ext4_free_extent *bex = &ac->ac_b_ex;
1623 struct ext4_free_extent *gex = &ac->ac_g_ex;
1624
1625 BUG_ON(ex->fe_len <= 0);
1626 BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1627 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1628 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1629
1630 ac->ac_found++;
1631
1632 /*
1633 * The special case - take what you catch first
1634 */
1635 if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1636 *bex = *ex;
1637 ext4_mb_use_best_found(ac, e4b);
1638 return;
1639 }
1640
1641 /*
1642 * Let's check whether the chuck is good enough
1643 */
1644 if (ex->fe_len == gex->fe_len) {
1645 *bex = *ex;
1646 ext4_mb_use_best_found(ac, e4b);
1647 return;
1648 }
1649
1650 /*
1651 * If this is first found extent, just store it in the context
1652 */
1653 if (bex->fe_len == 0) {
1654 *bex = *ex;
1655 return;
1656 }
1657
1658 /*
1659 * If new found extent is better, store it in the context
1660 */
1661 if (bex->fe_len < gex->fe_len) {
1662 /* if the request isn't satisfied, any found extent
1663 * larger than previous best one is better */
1664 if (ex->fe_len > bex->fe_len)
1665 *bex = *ex;
1666 } else if (ex->fe_len > gex->fe_len) {
1667 /* if the request is satisfied, then we try to find
1668 * an extent that still satisfy the request, but is
1669 * smaller than previous one */
1670 if (ex->fe_len < bex->fe_len)
1671 *bex = *ex;
1672 }
1673
1674 ext4_mb_check_limits(ac, e4b, 0);
1675}
1676
1677static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1678 struct ext4_buddy *e4b)
1679{
1680 struct ext4_free_extent ex = ac->ac_b_ex;
1681 ext4_group_t group = ex.fe_group;
1682 int max;
1683 int err;
1684
1685 BUG_ON(ex.fe_len <= 0);
1686 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1687 if (err)
1688 return err;
1689
1690 ext4_lock_group(ac->ac_sb, group);
1691 max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
1692
1693 if (max > 0) {
1694 ac->ac_b_ex = ex;
1695 ext4_mb_use_best_found(ac, e4b);
1696 }
1697
1698 ext4_unlock_group(ac->ac_sb, group);
1699 ext4_mb_release_desc(e4b);
1700
1701 return 0;
1702}
1703
1704static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1705 struct ext4_buddy *e4b)
1706{
1707 ext4_group_t group = ac->ac_g_ex.fe_group;
1708 int max;
1709 int err;
1710 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1711 struct ext4_super_block *es = sbi->s_es;
1712 struct ext4_free_extent ex;
1713
1714 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1715 return 0;
1716
1717 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1718 if (err)
1719 return err;
1720
1721 ext4_lock_group(ac->ac_sb, group);
1722 max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
1723 ac->ac_g_ex.fe_len, &ex);
1724
1725 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1726 ext4_fsblk_t start;
1727
1728 start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) +
1729 ex.fe_start + le32_to_cpu(es->s_first_data_block);
1730 /* use do_div to get remainder (would be 64-bit modulo) */
1731 if (do_div(start, sbi->s_stripe) == 0) {
1732 ac->ac_found++;
1733 ac->ac_b_ex = ex;
1734 ext4_mb_use_best_found(ac, e4b);
1735 }
1736 } else if (max >= ac->ac_g_ex.fe_len) {
1737 BUG_ON(ex.fe_len <= 0);
1738 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1739 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1740 ac->ac_found++;
1741 ac->ac_b_ex = ex;
1742 ext4_mb_use_best_found(ac, e4b);
1743 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
1744 /* Sometimes, caller may want to merge even small
1745 * number of blocks to an existing extent */
1746 BUG_ON(ex.fe_len <= 0);
1747 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1748 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1749 ac->ac_found++;
1750 ac->ac_b_ex = ex;
1751 ext4_mb_use_best_found(ac, e4b);
1752 }
1753 ext4_unlock_group(ac->ac_sb, group);
1754 ext4_mb_release_desc(e4b);
1755
1756 return 0;
1757}
1758
1759/*
1760 * The routine scans buddy structures (not bitmap!) from given order
1761 * to max order and tries to find big enough chunk to satisfy the req
1762 */
1763static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1764 struct ext4_buddy *e4b)
1765{
1766 struct super_block *sb = ac->ac_sb;
1767 struct ext4_group_info *grp = e4b->bd_info;
1768 void *buddy;
1769 int i;
1770 int k;
1771 int max;
1772
1773 BUG_ON(ac->ac_2order <= 0);
1774 for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
1775 if (grp->bb_counters[i] == 0)
1776 continue;
1777
1778 buddy = mb_find_buddy(e4b, i, &max);
1779 BUG_ON(buddy == NULL);
1780
1781 k = ext4_find_next_zero_bit(buddy, max, 0);
1782 BUG_ON(k >= max);
1783
1784 ac->ac_found++;
1785
1786 ac->ac_b_ex.fe_len = 1 << i;
1787 ac->ac_b_ex.fe_start = k << i;
1788 ac->ac_b_ex.fe_group = e4b->bd_group;
1789
1790 ext4_mb_use_best_found(ac, e4b);
1791
1792 BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1793
1794 if (EXT4_SB(sb)->s_mb_stats)
1795 atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
1796
1797 break;
1798 }
1799}
1800
1801/*
1802 * The routine scans the group and measures all found extents.
1803 * In order to optimize scanning, caller must pass number of
1804 * free blocks in the group, so the routine can know upper limit.
1805 */
1806static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1807 struct ext4_buddy *e4b)
1808{
1809 struct super_block *sb = ac->ac_sb;
1810 void *bitmap = EXT4_MB_BITMAP(e4b);
1811 struct ext4_free_extent ex;
1812 int i;
1813 int free;
1814
1815 free = e4b->bd_info->bb_free;
1816 BUG_ON(free <= 0);
1817
1818 i = e4b->bd_info->bb_first_free;
1819
1820 while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1821 i = ext4_find_next_zero_bit(bitmap,
1822 EXT4_BLOCKS_PER_GROUP(sb), i);
1823 if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
1824 BUG_ON(free != 0);
1825 break;
1826 }
1827
1828 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
1829 BUG_ON(ex.fe_len <= 0);
1830 BUG_ON(free < ex.fe_len);
1831
1832 ext4_mb_measure_extent(ac, &ex, e4b);
1833
1834 i += ex.fe_len;
1835 free -= ex.fe_len;
1836 }
1837
1838 ext4_mb_check_limits(ac, e4b, 1);
1839}
1840
1841/*
1842 * This is a special case for storages like raid5
1843 * we try to find stripe-aligned chunks for stripe-size requests
1844 * XXX should do so at least for multiples of stripe size as well
1845 */
1846static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1847 struct ext4_buddy *e4b)
1848{
1849 struct super_block *sb = ac->ac_sb;
1850 struct ext4_sb_info *sbi = EXT4_SB(sb);
1851 void *bitmap = EXT4_MB_BITMAP(e4b);
1852 struct ext4_free_extent ex;
1853 ext4_fsblk_t first_group_block;
1854 ext4_fsblk_t a;
1855 ext4_grpblk_t i;
1856 int max;
1857
1858 BUG_ON(sbi->s_stripe == 0);
1859
1860 /* find first stripe-aligned block in group */
1861 first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb)
1862 + le32_to_cpu(sbi->s_es->s_first_data_block);
1863 a = first_group_block + sbi->s_stripe - 1;
1864 do_div(a, sbi->s_stripe);
1865 i = (a * sbi->s_stripe) - first_group_block;
1866
1867 while (i < EXT4_BLOCKS_PER_GROUP(sb)) {
1868 if (!mb_test_bit(i, bitmap)) {
1869 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
1870 if (max >= sbi->s_stripe) {
1871 ac->ac_found++;
1872 ac->ac_b_ex = ex;
1873 ext4_mb_use_best_found(ac, e4b);
1874 break;
1875 }
1876 }
1877 i += sbi->s_stripe;
1878 }
1879}
1880
1881static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1882 ext4_group_t group, int cr)
1883{
1884 unsigned free, fragments;
1885 unsigned i, bits;
1886 struct ext4_group_desc *desc;
1887 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1888
1889 BUG_ON(cr < 0 || cr >= 4);
1890 BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
1891
1892 free = grp->bb_free;
1893 fragments = grp->bb_fragments;
1894 if (free == 0)
1895 return 0;
1896 if (fragments == 0)
1897 return 0;
1898
1899 switch (cr) {
1900 case 0:
1901 BUG_ON(ac->ac_2order == 0);
1902 /* If this group is uninitialized, skip it initially */
1903 desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
1904 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1905 return 0;
1906
1907 bits = ac->ac_sb->s_blocksize_bits + 1;
1908 for (i = ac->ac_2order; i <= bits; i++)
1909 if (grp->bb_counters[i] > 0)
1910 return 1;
1911 break;
1912 case 1:
1913 if ((free / fragments) >= ac->ac_g_ex.fe_len)
1914 return 1;
1915 break;
1916 case 2:
1917 if (free >= ac->ac_g_ex.fe_len)
1918 return 1;
1919 break;
1920 case 3:
1921 return 1;
1922 default:
1923 BUG();
1924 }
1925
1926 return 0;
1927}
1928
1929static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1930{
1931 ext4_group_t group;
1932 ext4_group_t i;
1933 int cr;
1934 int err = 0;
1935 int bsbits;
1936 struct ext4_sb_info *sbi;
1937 struct super_block *sb;
1938 struct ext4_buddy e4b;
1939 loff_t size, isize;
1940
1941 sb = ac->ac_sb;
1942 sbi = EXT4_SB(sb);
1943 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1944
1945 /* first, try the goal */
1946 err = ext4_mb_find_by_goal(ac, &e4b);
1947 if (err || ac->ac_status == AC_STATUS_FOUND)
1948 goto out;
1949
1950 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
1951 goto out;
1952
1953 /*
1954 * ac->ac2_order is set only if the fe_len is a power of 2
1955 * if ac2_order is set we also set criteria to 0 so that we
1956 * try exact allocation using buddy.
1957 */
1958 i = fls(ac->ac_g_ex.fe_len);
1959 ac->ac_2order = 0;
1960 /*
1961 * We search using buddy data only if the order of the request
1962 * is greater than equal to the sbi_s_mb_order2_reqs
1963 * You can tune it via /proc/fs/ext4/<partition>/order2_req
1964 */
1965 if (i >= sbi->s_mb_order2_reqs) {
1966 /*
1967 * This should tell if fe_len is exactly power of 2
1968 */
1969 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
1970 ac->ac_2order = i - 1;
1971 }
1972
1973 bsbits = ac->ac_sb->s_blocksize_bits;
1974 /* if stream allocation is enabled, use global goal */
1975 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
1976 isize = i_size_read(ac->ac_inode) >> bsbits;
1977 if (size < isize)
1978 size = isize;
1979
1980 if (size < sbi->s_mb_stream_request &&
1981 (ac->ac_flags & EXT4_MB_HINT_DATA)) {
1982 /* TBD: may be hot point */
1983 spin_lock(&sbi->s_md_lock);
1984 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
1985 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
1986 spin_unlock(&sbi->s_md_lock);
1987 }
1988
1989 /* searching for the right group start from the goal value specified */
1990 group = ac->ac_g_ex.fe_group;
1991
1992 /* Let's just scan groups to find more-less suitable blocks */
1993 cr = ac->ac_2order ? 0 : 1;
1994 /*
1995 * cr == 0 try to get exact allocation,
1996 * cr == 3 try to get anything
1997 */
1998repeat:
1999 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2000 ac->ac_criteria = cr;
2001 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
2002 struct ext4_group_info *grp;
2003 struct ext4_group_desc *desc;
2004
2005 if (group == EXT4_SB(sb)->s_groups_count)
2006 group = 0;
2007
2008 /* quick check to skip empty groups */
2009 grp = ext4_get_group_info(ac->ac_sb, group);
2010 if (grp->bb_free == 0)
2011 continue;
2012
2013 /*
2014 * if the group is already init we check whether it is
2015 * a good group and if not we don't load the buddy
2016 */
2017 if (EXT4_MB_GRP_NEED_INIT(grp)) {
2018 /*
2019 * we need full data about the group
2020 * to make a good selection
2021 */
2022 err = ext4_mb_load_buddy(sb, group, &e4b);
2023 if (err)
2024 goto out;
2025 ext4_mb_release_desc(&e4b);
2026 }
2027
2028 /*
2029 * If the particular group doesn't satisfy our
2030 * criteria we continue with the next group
2031 */
2032 if (!ext4_mb_good_group(ac, group, cr))
2033 continue;
2034
2035 err = ext4_mb_load_buddy(sb, group, &e4b);
2036 if (err)
2037 goto out;
2038
2039 ext4_lock_group(sb, group);
2040 if (!ext4_mb_good_group(ac, group, cr)) {
2041 /* someone did allocation from this group */
2042 ext4_unlock_group(sb, group);
2043 ext4_mb_release_desc(&e4b);
2044 continue;
2045 }
2046
2047 ac->ac_groups_scanned++;
2048 desc = ext4_get_group_desc(sb, group, NULL);
2049 if (cr == 0 || (desc->bg_flags &
2050 cpu_to_le16(EXT4_BG_BLOCK_UNINIT) &&
2051 ac->ac_2order != 0))
2052 ext4_mb_simple_scan_group(ac, &e4b);
2053 else if (cr == 1 &&
2054 ac->ac_g_ex.fe_len == sbi->s_stripe)
2055 ext4_mb_scan_aligned(ac, &e4b);
2056 else
2057 ext4_mb_complex_scan_group(ac, &e4b);
2058
2059 ext4_unlock_group(sb, group);
2060 ext4_mb_release_desc(&e4b);
2061
2062 if (ac->ac_status != AC_STATUS_CONTINUE)
2063 break;
2064 }
2065 }
2066
2067 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2068 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2069 /*
2070 * We've been searching too long. Let's try to allocate
2071 * the best chunk we've found so far
2072 */
2073
2074 ext4_mb_try_best_found(ac, &e4b);
2075 if (ac->ac_status != AC_STATUS_FOUND) {
2076 /*
2077 * Someone more lucky has already allocated it.
2078 * The only thing we can do is just take first
2079 * found block(s)
2080 printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
2081 */
2082 ac->ac_b_ex.fe_group = 0;
2083 ac->ac_b_ex.fe_start = 0;
2084 ac->ac_b_ex.fe_len = 0;
2085 ac->ac_status = AC_STATUS_CONTINUE;
2086 ac->ac_flags |= EXT4_MB_HINT_FIRST;
2087 cr = 3;
2088 atomic_inc(&sbi->s_mb_lost_chunks);
2089 goto repeat;
2090 }
2091 }
2092out:
2093 return err;
2094}
2095
2096#ifdef EXT4_MB_HISTORY
/*
 * Per-open state of the mb_history proc file: a private snapshot of the
 * history ring taken at open time.
 */
struct ext4_mb_proc_session {
	/* copy of the history ring, freed on release */
	struct ext4_mb_history *history;
	/* filesystem the history belongs to */
	struct super_block *sb;
	/* index in the ring where iteration begins */
	int start;
	/* number of entries in the ring */
	int max;
};
2103
2104static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
2105 struct ext4_mb_history *hs,
2106 int first)
2107{
2108 if (hs == s->history + s->max)
2109 hs = s->history;
2110 if (!first && hs == s->history + s->start)
2111 return NULL;
2112 while (hs->orig.fe_len == 0) {
2113 hs++;
2114 if (hs == s->history + s->max)
2115 hs = s->history;
2116 if (hs == s->history + s->start)
2117 return NULL;
2118 }
2119 return hs;
2120}
2121
2122static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
2123{
2124 struct ext4_mb_proc_session *s = seq->private;
2125 struct ext4_mb_history *hs;
2126 int l = *pos;
2127
2128 if (l == 0)
2129 return SEQ_START_TOKEN;
2130 hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2131 if (!hs)
2132 return NULL;
2133 while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
2134 return hs;
2135}
2136
2137static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
2138 loff_t *pos)
2139{
2140 struct ext4_mb_proc_session *s = seq->private;
2141 struct ext4_mb_history *hs = v;
2142
2143 ++*pos;
2144 if (v == SEQ_START_TOKEN)
2145 return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2146 else
2147 return ext4_mb_history_skip_empty(s, ++hs, 0);
2148}
2149
2150static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2151{
2152 char buf[25], buf2[25], buf3[25], *fmt;
2153 struct ext4_mb_history *hs = v;
2154
2155 if (v == SEQ_START_TOKEN) {
2156 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2157 "%-5s %-2s %-5s %-5s %-5s %-6s\n",
2158 "pid", "inode", "original", "goal", "result", "found",
2159 "grps", "cr", "flags", "merge", "tail", "broken");
2160 return 0;
2161 }
2162
2163 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2164 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2165 "%-5u %-5s %-5u %-6u\n";
2166 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
2167 hs->result.fe_start, hs->result.fe_len,
2168 hs->result.fe_logical);
2169 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
2170 hs->orig.fe_start, hs->orig.fe_len,
2171 hs->orig.fe_logical);
2172 sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group,
2173 hs->goal.fe_start, hs->goal.fe_len,
2174 hs->goal.fe_logical);
2175 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
2176 hs->found, hs->groups, hs->cr, hs->flags,
2177 hs->merged ? "M" : "", hs->tail,
2178 hs->buddy ? 1 << hs->buddy : 0);
2179 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
2180 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
2181 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
2182 hs->result.fe_start, hs->result.fe_len,
2183 hs->result.fe_logical);
2184 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
2185 hs->orig.fe_start, hs->orig.fe_len,
2186 hs->orig.fe_logical);
2187 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
2188 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
2189 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
2190 hs->result.fe_start, hs->result.fe_len);
2191 seq_printf(seq, "%-5u %-8u %-23s discard\n",
2192 hs->pid, hs->ino, buf2);
2193 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
2194 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
2195 hs->result.fe_start, hs->result.fe_len);
2196 seq_printf(seq, "%-5u %-8u %-23s free\n",
2197 hs->pid, hs->ino, buf2);
2198 }
2199 return 0;
2200}
2201
/* seq_file ->stop for mb_history: iteration holds nothing to release. */
static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
{
	/* intentionally empty */
}
2205
2206static struct seq_operations ext4_mb_seq_history_ops = {
2207 .start = ext4_mb_seq_history_start,
2208 .next = ext4_mb_seq_history_next,
2209 .stop = ext4_mb_seq_history_stop,
2210 .show = ext4_mb_seq_history_show,
2211};
2212
2213static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
2214{
2215 struct super_block *sb = PDE(inode)->data;
2216 struct ext4_sb_info *sbi = EXT4_SB(sb);
2217 struct ext4_mb_proc_session *s;
2218 int rc;
2219 int size;
2220
2221 s = kmalloc(sizeof(*s), GFP_KERNEL);
2222 if (s == NULL)
2223 return -ENOMEM;
2224 s->sb = sb;
2225 size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
2226 s->history = kmalloc(size, GFP_KERNEL);
2227 if (s->history == NULL) {
2228 kfree(s);
2229 return -ENOMEM;
2230 }
2231
2232 spin_lock(&sbi->s_mb_history_lock);
2233 memcpy(s->history, sbi->s_mb_history, size);
2234 s->max = sbi->s_mb_history_max;
2235 s->start = sbi->s_mb_history_cur % s->max;
2236 spin_unlock(&sbi->s_mb_history_lock);
2237
2238 rc = seq_open(file, &ext4_mb_seq_history_ops);
2239 if (rc == 0) {
2240 struct seq_file *m = (struct seq_file *)file->private_data;
2241 m->private = s;
2242 } else {
2243 kfree(s->history);
2244 kfree(s);
2245 }
2246 return rc;
2247
2248}
2249
2250static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
2251{
2252 struct seq_file *seq = (struct seq_file *)file->private_data;
2253 struct ext4_mb_proc_session *s = seq->private;
2254 kfree(s->history);
2255 kfree(s);
2256 return seq_release(inode, file);
2257}
2258
2259static ssize_t ext4_mb_seq_history_write(struct file *file,
2260 const char __user *buffer,
2261 size_t count, loff_t *ppos)
2262{
2263 struct seq_file *seq = (struct seq_file *)file->private_data;
2264 struct ext4_mb_proc_session *s = seq->private;
2265 struct super_block *sb = s->sb;
2266 char str[32];
2267 int value;
2268
2269 if (count >= sizeof(str)) {
2270 printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
2271 "mb_history", (int)sizeof(str));
2272 return -EOVERFLOW;
2273 }
2274
2275 if (copy_from_user(str, buffer, count))
2276 return -EFAULT;
2277
2278 value = simple_strtol(str, NULL, 0);
2279 if (value < 0)
2280 return -ERANGE;
2281 EXT4_SB(sb)->s_mb_history_filter = value;
2282
2283 return count;
2284}
2285
2286static struct file_operations ext4_mb_seq_history_fops = {
2287 .owner = THIS_MODULE,
2288 .open = ext4_mb_seq_history_open,
2289 .read = seq_read,
2290 .write = ext4_mb_seq_history_write,
2291 .llseek = seq_lseek,
2292 .release = ext4_mb_seq_history_release,
2293};
2294
2295static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2296{
2297 struct super_block *sb = seq->private;
2298 struct ext4_sb_info *sbi = EXT4_SB(sb);
2299 ext4_group_t group;
2300
2301 if (*pos < 0 || *pos >= sbi->s_groups_count)
2302 return NULL;
2303
2304 group = *pos + 1;
2305 return (void *) group;
2306}
2307
2308static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2309{
2310 struct super_block *sb = seq->private;
2311 struct ext4_sb_info *sbi = EXT4_SB(sb);
2312 ext4_group_t group;
2313
2314 ++*pos;
2315 if (*pos < 0 || *pos >= sbi->s_groups_count)
2316 return NULL;
2317 group = *pos + 1;
2318 return (void *) group;;
2319}
2320
2321static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2322{
2323 struct super_block *sb = seq->private;
2324 long group = (long) v;
2325 int i;
2326 int err;
2327 struct ext4_buddy e4b;
2328 struct sg {
2329 struct ext4_group_info info;
2330 unsigned short counters[16];
2331 } sg;
2332
2333 group--;
2334 if (group == 0)
2335 seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
2336 "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
2337 "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
2338 "group", "free", "frags", "first",
2339 "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
2340 "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
2341
2342 i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2343 sizeof(struct ext4_group_info);
2344 err = ext4_mb_load_buddy(sb, group, &e4b);
2345 if (err) {
2346 seq_printf(seq, "#%-5lu: I/O error\n", group);
2347 return 0;
2348 }
2349 ext4_lock_group(sb, group);
2350 memcpy(&sg, ext4_get_group_info(sb, group), i);
2351 ext4_unlock_group(sb, group);
2352 ext4_mb_release_desc(&e4b);
2353
2354 seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
2355 sg.info.bb_fragments, sg.info.bb_first_free);
2356 for (i = 0; i <= 13; i++)
2357 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
2358 sg.info.bb_counters[i] : 0);
2359 seq_printf(seq, " ]\n");
2360
2361 return 0;
2362}
2363
/* seq_file ->stop for mb_groups: iteration holds nothing to release. */
static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
{
	/* intentionally empty */
}
2367
2368static struct seq_operations ext4_mb_seq_groups_ops = {
2369 .start = ext4_mb_seq_groups_start,
2370 .next = ext4_mb_seq_groups_next,
2371 .stop = ext4_mb_seq_groups_stop,
2372 .show = ext4_mb_seq_groups_show,
2373};
2374
2375static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2376{
2377 struct super_block *sb = PDE(inode)->data;
2378 int rc;
2379
2380 rc = seq_open(file, &ext4_mb_seq_groups_ops);
2381 if (rc == 0) {
2382 struct seq_file *m = (struct seq_file *)file->private_data;
2383 m->private = sb;
2384 }
2385 return rc;
2386
2387}
2388
2389static struct file_operations ext4_mb_seq_groups_fops = {
2390 .owner = THIS_MODULE,
2391 .open = ext4_mb_seq_groups_open,
2392 .read = seq_read,
2393 .llseek = seq_lseek,
2394 .release = seq_release,
2395};
2396
2397static void ext4_mb_history_release(struct super_block *sb)
2398{
2399 struct ext4_sb_info *sbi = EXT4_SB(sb);
2400
2401 remove_proc_entry("mb_groups", sbi->s_mb_proc);
2402 remove_proc_entry("mb_history", sbi->s_mb_proc);
2403
2404 kfree(sbi->s_mb_history);
2405}
2406
2407static void ext4_mb_history_init(struct super_block *sb)
2408{
2409 struct ext4_sb_info *sbi = EXT4_SB(sb);
2410 int i;
2411
2412 if (sbi->s_mb_proc != NULL) {
2413 struct proc_dir_entry *p;
2414 p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc);
2415 if (p) {
2416 p->proc_fops = &ext4_mb_seq_history_fops;
2417 p->data = sb;
2418 }
2419 p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc);
2420 if (p) {
2421 p->proc_fops = &ext4_mb_seq_groups_fops;
2422 p->data = sb;
2423 }
2424 }
2425
2426 sbi->s_mb_history_max = 1000;
2427 sbi->s_mb_history_cur = 0;
2428 spin_lock_init(&sbi->s_mb_history_lock);
2429 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2430 sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
2431 if (likely(sbi->s_mb_history != NULL))
2432 memset(sbi->s_mb_history, 0, i);
2433 /* if we can't allocate history, then we simple won't use it */
2434}
2435
2436static void ext4_mb_store_history(struct ext4_allocation_context *ac)
2437{
2438 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2439 struct ext4_mb_history h;
2440
2441 if (unlikely(sbi->s_mb_history == NULL))
2442 return;
2443
2444 if (!(ac->ac_op & sbi->s_mb_history_filter))
2445 return;
2446
2447 h.op = ac->ac_op;
2448 h.pid = current->pid;
2449 h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
2450 h.orig = ac->ac_o_ex;
2451 h.result = ac->ac_b_ex;
2452 h.flags = ac->ac_flags;
2453 h.found = ac->ac_found;
2454 h.groups = ac->ac_groups_scanned;
2455 h.cr = ac->ac_criteria;
2456 h.tail = ac->ac_tail;
2457 h.buddy = ac->ac_buddy;
2458 h.merged = 0;
2459 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
2460 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
2461 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
2462 h.merged = 1;
2463 h.goal = ac->ac_g_ex;
2464 h.result = ac->ac_f_ex;
2465 }
2466
2467 spin_lock(&sbi->s_mb_history_lock);
2468 memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
2469 if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
2470 sbi->s_mb_history_cur = 0;
2471 spin_unlock(&sbi->s_mb_history_lock);
2472}
2473
2474#else
2475#define ext4_mb_history_release(sb)
2476#define ext4_mb_history_init(sb)
2477#endif
2478
2479static int ext4_mb_init_backend(struct super_block *sb)
2480{
2481 ext4_group_t i;
2482 int j, len, metalen;
2483 struct ext4_sb_info *sbi = EXT4_SB(sb);
2484 int num_meta_group_infos =
2485 (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
2486 EXT4_DESC_PER_BLOCK_BITS(sb);
2487 struct ext4_group_info **meta_group_info;
2488
2489 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2490 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2491 * So a two level scheme suffices for now. */
2492 sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
2493 num_meta_group_infos, GFP_KERNEL);
2494 if (sbi->s_group_info == NULL) {
2495 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
2496 return -ENOMEM;
2497 }
2498 sbi->s_buddy_cache = new_inode(sb);
2499 if (sbi->s_buddy_cache == NULL) {
2500 printk(KERN_ERR "EXT4-fs: can't get new inode\n");
2501 goto err_freesgi;
2502 }
2503 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2504
2505 metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
2506 for (i = 0; i < num_meta_group_infos; i++) {
2507 if ((i + 1) == num_meta_group_infos)
2508 metalen = sizeof(*meta_group_info) *
2509 (sbi->s_groups_count -
2510 (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
2511 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2512 if (meta_group_info == NULL) {
2513 printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2514 "buddy group\n");
2515 goto err_freemeta;
2516 }
2517 sbi->s_group_info[i] = meta_group_info;
2518 }
2519
2520 /*
2521 * calculate needed size. if change bb_counters size,
2522 * don't forget about ext4_mb_generate_buddy()
2523 */
2524 len = sizeof(struct ext4_group_info);
2525 len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
2526 for (i = 0; i < sbi->s_groups_count; i++) {
2527 struct ext4_group_desc *desc;
2528
2529 meta_group_info =
2530 sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2531 j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
2532
2533 meta_group_info[j] = kzalloc(len, GFP_KERNEL);
2534 if (meta_group_info[j] == NULL) {
2535 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2536 i--;
2537 goto err_freebuddy;
2538 }
2539 desc = ext4_get_group_desc(sb, i, NULL);
2540 if (desc == NULL) {
2541 printk(KERN_ERR
2542 "EXT4-fs: can't read descriptor %lu\n", i);
2543 goto err_freebuddy;
2544 }
2545 memset(meta_group_info[j], 0, len);
2546 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2547 &(meta_group_info[j]->bb_state));
2548
2549 /*
2550 * initialize bb_free to be able to skip
2551 * empty groups without initialization
2552 */
2553 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2554 meta_group_info[j]->bb_free =
2555 ext4_free_blocks_after_init(sb, i, desc);
2556 } else {
2557 meta_group_info[j]->bb_free =
2558 le16_to_cpu(desc->bg_free_blocks_count);
2559 }
2560
2561 INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
2562
2563#ifdef DOUBLE_CHECK
2564 {
2565 struct buffer_head *bh;
2566 meta_group_info[j]->bb_bitmap =
2567 kmalloc(sb->s_blocksize, GFP_KERNEL);
2568 BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
2569 bh = read_block_bitmap(sb, i);
2570 BUG_ON(bh == NULL);
2571 memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
2572 sb->s_blocksize);
2573 put_bh(bh);
2574 }
2575#endif
2576
2577 }
2578
2579 return 0;
2580
2581err_freebuddy:
2582 while (i >= 0) {
2583 kfree(ext4_get_group_info(sb, i));
2584 i--;
2585 }
2586 i = num_meta_group_infos;
2587err_freemeta:
2588 while (--i >= 0)
2589 kfree(sbi->s_group_info[i]);
2590 iput(sbi->s_buddy_cache);
2591err_freesgi:
2592 kfree(sbi->s_group_info);
2593 return -ENOMEM;
2594}
2595
2596int ext4_mb_init(struct super_block *sb, int needs_recovery)
2597{
2598 struct ext4_sb_info *sbi = EXT4_SB(sb);
2599 unsigned i;
2600 unsigned offset;
2601 unsigned max;
2602
2603 if (!test_opt(sb, MBALLOC))
2604 return 0;
2605
2606 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
2607
2608 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2609 if (sbi->s_mb_offsets == NULL) {
2610 clear_opt(sbi->s_mount_opt, MBALLOC);
2611 return -ENOMEM;
2612 }
2613 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2614 if (sbi->s_mb_maxs == NULL) {
2615 clear_opt(sbi->s_mount_opt, MBALLOC);
2616 kfree(sbi->s_mb_maxs);
2617 return -ENOMEM;
2618 }
2619
2620 /* order 0 is regular bitmap */
2621 sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2622 sbi->s_mb_offsets[0] = 0;
2623
2624 i = 1;
2625 offset = 0;
2626 max = sb->s_blocksize << 2;
2627 do {
2628 sbi->s_mb_offsets[i] = offset;
2629 sbi->s_mb_maxs[i] = max;
2630 offset += 1 << (sb->s_blocksize_bits - i);
2631 max = max >> 1;
2632 i++;
2633 } while (i <= sb->s_blocksize_bits + 1);
2634
2635 /* init file for buddy data */
2636 i = ext4_mb_init_backend(sb);
2637 if (i) {
2638 clear_opt(sbi->s_mount_opt, MBALLOC);
2639 kfree(sbi->s_mb_offsets);
2640 kfree(sbi->s_mb_maxs);
2641 return i;
2642 }
2643
2644 spin_lock_init(&sbi->s_md_lock);
2645 INIT_LIST_HEAD(&sbi->s_active_transaction);
2646 INIT_LIST_HEAD(&sbi->s_closed_transaction);
2647 INIT_LIST_HEAD(&sbi->s_committed_transaction);
2648 spin_lock_init(&sbi->s_bal_lock);
2649
2650 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
2651 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
2652 sbi->s_mb_stats = MB_DEFAULT_STATS;
2653 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2654 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2655 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2656 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2657
2658 i = sizeof(struct ext4_locality_group) * NR_CPUS;
2659 sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
2660 if (sbi->s_locality_groups == NULL) {
2661 clear_opt(sbi->s_mount_opt, MBALLOC);
2662 kfree(sbi->s_mb_offsets);
2663 kfree(sbi->s_mb_maxs);
2664 return -ENOMEM;
2665 }
2666 for (i = 0; i < NR_CPUS; i++) {
2667 struct ext4_locality_group *lg;
2668 lg = &sbi->s_locality_groups[i];
2669 mutex_init(&lg->lg_mutex);
2670 INIT_LIST_HEAD(&lg->lg_prealloc_list);
2671 spin_lock_init(&lg->lg_prealloc_lock);
2672 }
2673
2674 ext4_mb_init_per_dev_proc(sb);
2675 ext4_mb_history_init(sb);
2676
2677 printk("EXT4-fs: mballoc enabled\n");
2678 return 0;
2679}
2680
2681/* need to called with ext4 group lock (ext4_lock_group) */
2682static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2683{
2684 struct ext4_prealloc_space *pa;
2685 struct list_head *cur, *tmp;
2686 int count = 0;
2687
2688 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
2689 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
2690 list_del(&pa->pa_group_list);
2691 count++;
2692 kfree(pa);
2693 }
2694 if (count)
2695 mb_debug("mballoc: %u PAs left\n", count);
2696
2697}
2698
2699int ext4_mb_release(struct super_block *sb)
2700{
2701 ext4_group_t i;
2702 int num_meta_group_infos;
2703 struct ext4_group_info *grinfo;
2704 struct ext4_sb_info *sbi = EXT4_SB(sb);
2705
2706 if (!test_opt(sb, MBALLOC))
2707 return 0;
2708
2709 /* release freed, non-committed blocks */
2710 spin_lock(&sbi->s_md_lock);
2711 list_splice_init(&sbi->s_closed_transaction,
2712 &sbi->s_committed_transaction);
2713 list_splice_init(&sbi->s_active_transaction,
2714 &sbi->s_committed_transaction);
2715 spin_unlock(&sbi->s_md_lock);
2716 ext4_mb_free_committed_blocks(sb);
2717
2718 if (sbi->s_group_info) {
2719 for (i = 0; i < sbi->s_groups_count; i++) {
2720 grinfo = ext4_get_group_info(sb, i);
2721#ifdef DOUBLE_CHECK
2722 kfree(grinfo->bb_bitmap);
2723#endif
2724 ext4_lock_group(sb, i);
2725 ext4_mb_cleanup_pa(grinfo);
2726 ext4_unlock_group(sb, i);
2727 kfree(grinfo);
2728 }
2729 num_meta_group_infos = (sbi->s_groups_count +
2730 EXT4_DESC_PER_BLOCK(sb) - 1) >>
2731 EXT4_DESC_PER_BLOCK_BITS(sb);
2732 for (i = 0; i < num_meta_group_infos; i++)
2733 kfree(sbi->s_group_info[i]);
2734 kfree(sbi->s_group_info);
2735 }
2736 kfree(sbi->s_mb_offsets);
2737 kfree(sbi->s_mb_maxs);
2738 if (sbi->s_buddy_cache)
2739 iput(sbi->s_buddy_cache);
2740 if (sbi->s_mb_stats) {
2741 printk(KERN_INFO
2742 "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n",
2743 atomic_read(&sbi->s_bal_allocated),
2744 atomic_read(&sbi->s_bal_reqs),
2745 atomic_read(&sbi->s_bal_success));
2746 printk(KERN_INFO
2747 "EXT4-fs: mballoc: %u extents scanned, %u goal hits, "
2748 "%u 2^N hits, %u breaks, %u lost\n",
2749 atomic_read(&sbi->s_bal_ex_scanned),
2750 atomic_read(&sbi->s_bal_goals),
2751 atomic_read(&sbi->s_bal_2orders),
2752 atomic_read(&sbi->s_bal_breaks),
2753 atomic_read(&sbi->s_mb_lost_chunks));
2754 printk(KERN_INFO
2755 "EXT4-fs: mballoc: %lu generated and it took %Lu\n",
2756 sbi->s_mb_buddies_generated++,
2757 sbi->s_mb_generation_time);
2758 printk(KERN_INFO
2759 "EXT4-fs: mballoc: %u preallocated, %u discarded\n",
2760 atomic_read(&sbi->s_mb_preallocated),
2761 atomic_read(&sbi->s_mb_discarded));
2762 }
2763
2764 kfree(sbi->s_locality_groups);
2765
2766 ext4_mb_history_release(sb);
2767 ext4_mb_destroy_per_dev_proc(sb);
2768
2769 return 0;
2770}
2771
2772static void ext4_mb_free_committed_blocks(struct super_block *sb)
2773{
2774 struct ext4_sb_info *sbi = EXT4_SB(sb);
2775 int err;
2776 int i;
2777 int count = 0;
2778 int count2 = 0;
2779 struct ext4_free_metadata *md;
2780 struct ext4_buddy e4b;
2781
2782 if (list_empty(&sbi->s_committed_transaction))
2783 return;
2784
2785 /* there is committed blocks to be freed yet */
2786 do {
2787 /* get next array of blocks */
2788 md = NULL;
2789 spin_lock(&sbi->s_md_lock);
2790 if (!list_empty(&sbi->s_committed_transaction)) {
2791 md = list_entry(sbi->s_committed_transaction.next,
2792 struct ext4_free_metadata, list);
2793 list_del(&md->list);
2794 }
2795 spin_unlock(&sbi->s_md_lock);
2796
2797 if (md == NULL)
2798 break;
2799
2800 mb_debug("gonna free %u blocks in group %lu (0x%p):",
2801 md->num, md->group, md);
2802
2803 err = ext4_mb_load_buddy(sb, md->group, &e4b);
2804 /* we expect to find existing buddy because it's pinned */
2805 BUG_ON(err != 0);
2806
2807 /* there are blocks to put in buddy to make them really free */
2808 count += md->num;
2809 count2++;
2810 ext4_lock_group(sb, md->group);
2811 for (i = 0; i < md->num; i++) {
2812 mb_debug(" %u", md->blocks[i]);
2813 err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
2814 BUG_ON(err != 0);
2815 }
2816 mb_debug("\n");
2817 ext4_unlock_group(sb, md->group);
2818
2819 /* balance refcounts from ext4_mb_free_metadata() */
2820 page_cache_release(e4b.bd_buddy_page);
2821 page_cache_release(e4b.bd_bitmap_page);
2822
2823 kfree(md);
2824 ext4_mb_release_desc(&e4b);
2825
2826 } while (md);
2827
2828 mb_debug("freed %u blocks in %u structures\n", count, count2);
2829}
2830
/*
 * Names used for the allocator's proc entries: EXT4_ROOT is the directory
 * created under proc_root_fs, the others are the per-device tunable files.
 */
2831#define EXT4_ROOT "ext4"
2832#define EXT4_MB_STATS_NAME "stats"
2833#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan"
2834#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan"
2835#define EXT4_MB_ORDER2_REQ "order2_req"
2836#define EXT4_MB_STREAM_REQ "stream_req"
2837#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
2838
2839
2840
/*
 * MB_PROC_VALUE_READ(name) - define ext4_mb_read_<name>(), a read_proc
 * handler that prints sbi->s_mb_<name> followed by a newline.
 *
 * The whole value is produced on the read at offset 0; reads at any other
 * offset return 0. *eof is always set.
 */
#define MB_PROC_VALUE_READ(name)					\
static int ext4_mb_read_##name(char *page, char **start,		\
		off_t off, int count, int *eof, void *data)		\
{									\
	struct ext4_sb_info *sbi = data;				\
	int written = 0;						\
									\
	*eof = 1;							\
	if (off == 0) {							\
		written = sprintf(page, "%ld\n", sbi->s_mb_##name);	\
		*start = page;						\
	}								\
	return written;							\
}
2854
/*
 * MB_PROC_VALUE_WRITE(name) - define ext4_mb_write_<name>(), a write_proc
 * handler that parses a positive integer from user space and stores it in
 * sbi->s_mb_<name>.
 *
 * Returns the number of bytes consumed, -EINVAL if the input does not fit
 * the local buffer, -EFAULT if it cannot be copied, or -ERANGE if the
 * parsed value is not positive.
 *
 * The copied bytes are explicitly NUL-terminated before parsing: user space
 * is not required to send a terminator, and simple_strtol() would otherwise
 * run on into uninitialised stack.
 */
#define MB_PROC_VALUE_WRITE(name)					\
static int ext4_mb_write_##name(struct file *file,			\
		const char __user *buf, unsigned long cnt, void *data)	\
{									\
	struct ext4_sb_info *sbi = data;				\
	char str[32];							\
	long value;							\
	if (cnt >= sizeof(str))						\
		return -EINVAL;						\
	if (copy_from_user(str, buf, cnt))				\
		return -EFAULT;						\
	str[cnt] = '\0';						\
	value = simple_strtol(str, NULL, 0);				\
	if (value <= 0)							\
		return -ERANGE;						\
	sbi->s_mb_##name = value;					\
	return cnt;							\
}
2872
/*
 * Instantiate the ext4_mb_read_<name>()/ext4_mb_write_<name>() handler pair
 * for each tunable; <name> selects the sbi->s_mb_<name> field.
 */
2873MB_PROC_VALUE_READ(stats);
2874MB_PROC_VALUE_WRITE(stats);
2875MB_PROC_VALUE_READ(max_to_scan);
2876MB_PROC_VALUE_WRITE(max_to_scan);
2877MB_PROC_VALUE_READ(min_to_scan);
2878MB_PROC_VALUE_WRITE(min_to_scan);
2879MB_PROC_VALUE_READ(order2_reqs);
2880MB_PROC_VALUE_WRITE(order2_reqs);
2881MB_PROC_VALUE_READ(stream_request);
2882MB_PROC_VALUE_WRITE(stream_request);
2883MB_PROC_VALUE_READ(group_prealloc);
2884MB_PROC_VALUE_WRITE(group_prealloc);
2885
/*
 * MB_PROC_HANDLER(name, var) - create the proc file @name under
 * sbi->s_mb_proc and wire it to ext4_mb_read_<var>/ext4_mb_write_<var>.
 *
 * Not self-contained: it relies on the expanding function providing the
 * locals "proc", "mode" and "sbi" and an "err_out" label, which it jumps
 * to when the entry cannot be created.
 */
2886#define MB_PROC_HANDLER(name, var) \
2887do { \
2888 proc = create_proc_entry(name, mode, sbi->s_mb_proc); \
2889 if (proc == NULL) { \
2890 printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
2891 goto err_out; \
2892 } \
2893 proc->data = sbi; \
2894 proc->read_proc = ext4_mb_read_##var ; \
2895 proc->write_proc = ext4_mb_write_##var; \
2896} while (0)
2897
2898static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2899{
2900 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
2901 struct ext4_sb_info *sbi = EXT4_SB(sb);
2902 struct proc_dir_entry *proc;
2903 char devname[64];
2904
2905 snprintf(devname, sizeof(devname) - 1, "%s",
2906 bdevname(sb->s_bdev, devname));
2907 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
2908
2909 MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
2910 MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
2911 MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
2912 MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
2913 MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
2914 MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
2915
2916 return 0;
2917
2918err_out:
2919 printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
2920 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
2921 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
2922 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
2923 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
2924 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
2925 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
2926 remove_proc_entry(devname, proc_root_ext4);
2927 sbi->s_mb_proc = NULL;
2928
2929 return -ENOMEM;
2930}
2931
2932static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
2933{
2934 struct ext4_sb_info *sbi = EXT4_SB(sb);
2935 char devname[64];
2936
2937 if (sbi->s_mb_proc == NULL)
2938 return -EINVAL;
2939
2940 snprintf(devname, sizeof(devname) - 1, "%s",
2941 bdevname(sb->s_bdev, devname));
2942 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
2943 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
2944 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
2945 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
2946 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
2947 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
2948 remove_proc_entry(devname, proc_root_ext4);
2949
2950 return 0;
2951}
2952
2953int __init init_ext4_mballoc(void)
2954{
2955 ext4_pspace_cachep =
2956 kmem_cache_create("ext4_prealloc_space",
2957 sizeof(struct ext4_prealloc_space),
2958 0, SLAB_RECLAIM_ACCOUNT, NULL);
2959 if (ext4_pspace_cachep == NULL)
2960 return -ENOMEM;
2961
2962#ifdef CONFIG_PROC_FS
2963 proc_root_ext4 = proc_mkdir(EXT4_ROOT, proc_root_fs);
2964 if (proc_root_ext4 == NULL)
2965 printk(KERN_ERR "EXT4-fs: Unable to create %s\n", EXT4_ROOT);
2966#endif
2967
2968 return 0;
2969}
2970
/*
 * Module-level teardown: destroy the preallocation descriptor cache and,
 * with CONFIG_PROC_FS, remove the ext4 directory from proc_root_fs.
 */
2971void exit_ext4_mballoc(void)
2972{
2973 /* XXX: synchronize_rcu(); */
2974 kmem_cache_destroy(ext4_pspace_cachep);
2975#ifdef CONFIG_PROC_FS
2976 remove_proc_entry(EXT4_ROOT, proc_root_fs);
2977#endif
2978}
2979
2980
2981/*
2982 * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps
2983 * Returns 0 if success or error code
2984 */
2985static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2986 handle_t *handle)
2987{
2988 struct buffer_head *bitmap_bh = NULL;
2989 struct ext4_super_block *es;
2990 struct ext4_group_desc *gdp;
2991 struct buffer_head *gdp_bh;
2992 struct ext4_sb_info *sbi;
2993 struct super_block *sb;
2994 ext4_fsblk_t block;
2995 int err;
2996
2997 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
2998 BUG_ON(ac->ac_b_ex.fe_len <= 0);
2999
3000 sb = ac->ac_sb;
3001 sbi = EXT4_SB(sb);
3002 es = sbi->s_es;
3003
3004 ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
3005 gdp->bg_free_blocks_count);
3006
3007 err = -EIO;
3008 bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
3009 if (!bitmap_bh)
3010 goto out_err;
3011
3012 err = ext4_journal_get_write_access(handle, bitmap_bh);
3013 if (err)
3014 goto out_err;
3015
3016 err = -EIO;
3017 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
3018 if (!gdp)
3019 goto out_err;
3020
3021 err = ext4_journal_get_write_access(handle, gdp_bh);
3022 if (err)
3023 goto out_err;
3024
3025 block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb)
3026 + ac->ac_b_ex.fe_start
3027 + le32_to_cpu(es->s_first_data_block);
3028
3029 if (block == ext4_block_bitmap(sb, gdp) ||
3030 block == ext4_inode_bitmap(sb, gdp) ||
3031 in_range(block, ext4_inode_table(sb, gdp),
3032 EXT4_SB(sb)->s_itb_per_group)) {
3033
3034 ext4_error(sb, __FUNCTION__,
3035 "Allocating block in system zone - block = %llu",
3036 block);
3037 }
3038#ifdef AGGRESSIVE_CHECK
3039 {
3040 int i;
3041 for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
3042 BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
3043 bitmap_bh->b_data));
3044 }
3045 }
3046#endif
3047 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
3048 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
3049
3050 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
3051 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
3052 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3053 gdp->bg_free_blocks_count =
3054 cpu_to_le16(ext4_free_blocks_after_init(sb,
3055 ac->ac_b_ex.fe_group,
3056 gdp));
3057 }
3058 gdp->bg_free_blocks_count =
3059 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
3060 - ac->ac_b_ex.fe_len);
3061 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
3062 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
3063 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
3064
3065 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
3066 if (err)
3067 goto out_err;
3068 err = ext4_journal_dirty_metadata(handle, gdp_bh);
3069
3070out_err:
3071 sb->s_dirt = 1;
3072 put_bh(bitmap_bh);
3073 return err;
3074}
3075
3076/*
3077 * here we normalize request for locality group
3078 * Group request are normalized to s_strip size if we set the same via mount
3079 * option. If not we set it to s_mb_group_prealloc which can be configured via
3080 * /proc/fs/ext4/<partition>/group_prealloc
3081 *
3082 * XXX: should we try to preallocate more than the group has now?
3083 */
3084static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3085{
3086 struct super_block *sb = ac->ac_sb;
3087 struct ext4_locality_group *lg = ac->ac_lg;
3088
3089 BUG_ON(lg == NULL);
3090 if (EXT4_SB(sb)->s_stripe)
3091 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
3092 else
3093 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3094 mb_debug("#%u: goal %lu blocks for locality group\n",
3095 current->pid, ac->ac_g_ex.fe_len);
3096}
3097
3098/*
3099 * Normalization means making request better in terms of
3100 * size and alignment
3101 */
/*
 * Derives the goal extent ac->ac_g_ex (fe_logical, fe_len) from the original
 * request ac->ac_o_ex. Returns without touching the goal for non-data,
 * goal-only and no-prealloc requests; group-allocation requests are handed
 * to ext4_mb_normalize_group_request(). When the resulting range is adjacent
 * to the neighbours described by @ar, ac->ac_f_ex is pointed next to them
 * and EXT4_MB_HINT_TRY_GOAL is set.
 */
3102static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3103 struct ext4_allocation_request *ar)
3104{
3105 int bsbits, max;
3106 ext4_lblk_t end;
3107 struct list_head *cur;
3108 loff_t size, orig_size, start_off;
3109 ext4_lblk_t start, orig_start;
3110 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3111
3112 /* do normalize only data requests, metadata requests
3113 do not need preallocation */
3114 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3115 return;
3116
3117 /* sometime caller may want exact blocks */
3118 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3119 return;
3120
3121 /* caller may indicate that preallocation isn't
3122 * required (it's a tail, for example) */
3123 if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
3124 return;
3125
3126 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
3127 ext4_mb_normalize_group_request(ac);
3128 return ;
3129 }
3130
3131 bsbits = ac->ac_sb->s_blocksize_bits;
3132
3133 /* first, let's learn actual file size
3134 * given current request is allocated */
3135 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
3136 size = size << bsbits;
3137 if (size < i_size_read(ac->ac_inode))
3138 size = i_size_read(ac->ac_inode);
3139
3140 /* max available blocks in a free group */
3141 max = EXT4_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 -
3142 EXT4_SB(ac->ac_sb)->s_itb_per_group;
3143
/* true when req fits within size, or when a group cannot hold more than
 * size bytes worth of blocks anyway */
3144#define NRL_CHECK_SIZE(req, size, max,bits) \
3145 (req <= (size) || max <= ((size) >> bits))
3146
3147 /* first, try to predict filesize */
3148 /* XXX: should this table be tunable? */
/* up to 1MB the size is rounded up to the next listed power of two with
 * start_off 0; above that a 1MB/4MB/8MB window aligned around the
 * request's logical offset is used; the final fallback keeps the request
 * as it is (sizes are in bytes here, converted to blocks below) */
3149 start_off = 0;
3150 if (size <= 16 * 1024) {
3151 size = 16 * 1024;
3152 } else if (size <= 32 * 1024) {
3153 size = 32 * 1024;
3154 } else if (size <= 64 * 1024) {
3155 size = 64 * 1024;
3156 } else if (size <= 128 * 1024) {
3157 size = 128 * 1024;
3158 } else if (size <= 256 * 1024) {
3159 size = 256 * 1024;
3160 } else if (size <= 512 * 1024) {
3161 size = 512 * 1024;
3162 } else if (size <= 1024 * 1024) {
3163 size = 1024 * 1024;
3164 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) {
3165 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3166 (20 - bsbits)) << 20;
3167 size = 1024 * 1024;
3168 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) {
3169 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3170 (22 - bsbits)) << 22;
3171 size = 4 * 1024 * 1024;
3172 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
3173 (8<<20)>>bsbits, max, bsbits)) {
3174 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3175 (23 - bsbits)) << 23;
3176 size = 8 * 1024 * 1024;
3177 } else {
3178 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
3179 size = ac->ac_o_ex.fe_len << bsbits;
3180 }
/* from here on size and start are counted in blocks, not bytes */
3181 orig_size = size = size >> bsbits;
3182 orig_start = start = start_off >> bsbits;
3183
3184 /* don't cover already allocated blocks in selected range */
3185 if (ar->pleft && start <= ar->lleft) {
3186 size -= ar->lleft + 1 - start;
3187 start = ar->lleft + 1;
3188 }
3189 if (ar->pright && start + size - 1 >= ar->lright)
3190 size -= start + size - ar->lright;
3191
3192 end = start + size;
3193
3194 /* check we don't cross already preallocated blocks */
/* shrinks [start, end) so that it does not overlap any live PA of this
 * inode: a PA lying before the original request pushes start up, a PA
 * lying after it pulls end down */
3195 rcu_read_lock();
3196 list_for_each_rcu(cur, &ei->i_prealloc_list) {
3197 struct ext4_prealloc_space *pa;
3198 unsigned long pa_end;
3199
3200 pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
3201
3202 if (pa->pa_deleted)
3203 continue;
3204 spin_lock(&pa->pa_lock);
3205 if (pa->pa_deleted) {
3206 spin_unlock(&pa->pa_lock);
3207 continue;
3208 }
3209
3210 pa_end = pa->pa_lstart + pa->pa_len;
3211
3212 /* PA must not overlap original request */
3213 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3214 ac->ac_o_ex.fe_logical < pa->pa_lstart));
3215
3216 /* skip PA normalized request doesn't overlap with */
3217 if (pa->pa_lstart >= end) {
3218 spin_unlock(&pa->pa_lock);
3219 continue;
3220 }
3221 if (pa_end <= start) {
3222 spin_unlock(&pa->pa_lock);
3223 continue;
3224 }
3225 BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3226
3227 if (pa_end <= ac->ac_o_ex.fe_logical) {
3228 BUG_ON(pa_end < start);
3229 start = pa_end;
3230 }
3231
3232 if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3233 BUG_ON(pa->pa_lstart > end);
3234 end = pa->pa_lstart;
3235 }
3236 spin_unlock(&pa->pa_lock);
3237 }
3238 rcu_read_unlock();
3239 size = end - start;
3240
3241 /* XXX: extra loop to check we really don't overlap preallocations */
3242 rcu_read_lock();
3243 list_for_each_rcu(cur, &ei->i_prealloc_list) {
3244 struct ext4_prealloc_space *pa;
3245 unsigned long pa_end;
3246 pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
3247 spin_lock(&pa->pa_lock);
3248 if (pa->pa_deleted == 0) {
3249 pa_end = pa->pa_lstart + pa->pa_len;
3250 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3251 }
3252 spin_unlock(&pa->pa_lock);
3253 }
3254 rcu_read_unlock();
3255
/* NOTE(review): for a positive size the two halves of this condition
 * cannot both hold (start > fe_logical implies start + size > fe_logical),
 * so this check and the matching BUG_ON look like they never fire --
 * confirm whether "||" was intended */
3256 if (start + size <= ac->ac_o_ex.fe_logical &&
3257 start > ac->ac_o_ex.fe_logical) {
3258 printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n",
3259 (unsigned long) start, (unsigned long) size,
3260 (unsigned long) ac->ac_o_ex.fe_logical);
3261 }
3262 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3263 start > ac->ac_o_ex.fe_logical);
3264 BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
3265
3266 /* now prepare goal request */
3267
3268 /* XXX: is it better to align blocks WRT to logical
3269 * placement or satisfy big request as is */
3270 ac->ac_g_ex.fe_logical = start;
3271 ac->ac_g_ex.fe_len = size;
3272
3273 /* define goal start in order to merge */
/* if both neighbours are adjacent, the left-merge goal set second is the
 * one left in ac_f_ex */
3274 if (ar->pright && (ar->lright == (start + size))) {
3275 /* merge to the right */
3276 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3277 &ac->ac_f_ex.fe_group,
3278 &ac->ac_f_ex.fe_start);
3279 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3280 }
3281 if (ar->pleft && (ar->lleft + 1 == start)) {
3282 /* merge to the left */
3283 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3284 &ac->ac_f_ex.fe_group,
3285 &ac->ac_f_ex.fe_start);
3286 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3287 }
3288
3289 mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size,
3290 (unsigned) orig_size, (unsigned) start);
3291}
3292
3293static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3294{
3295 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3296
3297 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
3298 atomic_inc(&sbi->s_bal_reqs);
3299 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
3300 if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len)
3301 atomic_inc(&sbi->s_bal_success);
3302 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3303 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
3304 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
3305 atomic_inc(&sbi->s_bal_goals);
3306 if (ac->ac_found > sbi->s_mb_max_to_scan)
3307 atomic_inc(&sbi->s_bal_breaks);
3308 }
3309
3310 ext4_mb_store_history(ac);
3311}
3312
3313/*
3314 * use blocks preallocated to inode
3315 */
3316static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3317 struct ext4_prealloc_space *pa)
3318{
3319 ext4_fsblk_t start;
3320 ext4_fsblk_t end;
3321 int len;
3322
3323 /* found preallocated blocks, use them */
3324 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3325 end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len);
3326 len = end - start;
3327 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3328 &ac->ac_b_ex.fe_start);
3329 ac->ac_b_ex.fe_len = len;
3330 ac->ac_status = AC_STATUS_FOUND;
3331 ac->ac_pa = pa;
3332
3333 BUG_ON(start < pa->pa_pstart);
3334 BUG_ON(start + len > pa->pa_pstart + pa->pa_len);
3335 BUG_ON(pa->pa_free < len);
3336 pa->pa_free -= len;
3337
3338 mb_debug("use %llu/%lu from inode pa %p\n", start, len, pa);
3339}
3340
3341/*
3342 * use blocks preallocated to locality group
3343 */
3344static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3345 struct ext4_prealloc_space *pa)
3346{
3347 unsigned len = ac->ac_o_ex.fe_len;
3348
3349 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
3350 &ac->ac_b_ex.fe_group,
3351 &ac->ac_b_ex.fe_start);
3352 ac->ac_b_ex.fe_len = len;
3353 ac->ac_status = AC_STATUS_FOUND;
3354 ac->ac_pa = pa;
3355
3356 /* we don't correct pa_pstart or pa_plen here to avoid
3357 * possible race when tte group is being loaded concurrently
3358 * instead we correct pa later, after blocks are marked
3359 * in on-disk bitmap -- see ext4_mb_release_context() */
3360 /*
3361 * FIXME!! but the other CPUs can look at this particular
3362 * pa and think that it have enought free blocks if we
3363 * don't update pa_free here right ?
3364 */
3365 mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3366}
3367
3368/*
3369 * search goal blocks in preallocated space
3370 */
3371static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3372{
3373 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3374 struct ext4_locality_group *lg;
3375 struct ext4_prealloc_space *pa;
3376 struct list_head *cur;
3377
3378 /* only data can be preallocated */
3379 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3380 return 0;
3381
3382 /* first, try per-file preallocation */
3383 rcu_read_lock();
3384 list_for_each_rcu(cur, &ei->i_prealloc_list) {
3385 pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
3386
3387 /* all fields in this condition don't change,
3388 * so we can skip locking for them */
3389 if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3390 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
3391 continue;
3392
3393 /* found preallocated blocks, use them */
3394 spin_lock(&pa->pa_lock);
3395 if (pa->pa_deleted == 0 && pa->pa_free) {
3396 atomic_inc(&pa->pa_count);
3397 ext4_mb_use_inode_pa(ac, pa);
3398 spin_unlock(&pa->pa_lock);
3399 ac->ac_criteria = 10;
3400 rcu_read_unlock();
3401 return 1;
3402 }
3403 spin_unlock(&pa->pa_lock);
3404 }
3405 rcu_read_unlock();
3406
3407 /* can we use group allocation? */
3408 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3409 return 0;
3410
3411 /* inode may have no locality group for some reason */
3412 lg = ac->ac_lg;
3413 if (lg == NULL)
3414 return 0;
3415
3416 rcu_read_lock();
3417 list_for_each_rcu(cur, &lg->lg_prealloc_list) {
3418 pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
3419 spin_lock(&pa->pa_lock);
3420 if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
3421 atomic_inc(&pa->pa_count);
3422 ext4_mb_use_group_pa(ac, pa);
3423 spin_unlock(&pa->pa_lock);
3424 ac->ac_criteria = 20;
3425 rcu_read_unlock();
3426 return 1;
3427 }
3428 spin_unlock(&pa->pa_lock);
3429 }
3430 rcu_read_unlock();
3431
3432 return 0;
3433}
3434
3435/*
3436 * the function goes through all preallocation in this group and marks them
3437 * used in in-core bitmap. buddy must be generated from this bitmap
3438 * Need to be called with ext4 group lock (ext4_lock_group)
3439 */
3440static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3441 ext4_group_t group)
3442{
3443 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3444 struct ext4_prealloc_space *pa;
3445 struct list_head *cur;
3446 ext4_group_t groupnr;
3447 ext4_grpblk_t start;
3448 int preallocated = 0;
3449 int count = 0;
3450 int len;
3451
3452 /* all form of preallocation discards first load group,
3453 * so the only competing code is preallocation use.
3454 * we don't need any locking here
3455 * notice we do NOT ignore preallocations with pa_deleted
3456 * otherwise we could leave used blocks available for
3457 * allocation in buddy when concurrent ext4_mb_put_pa()
3458 * is dropping preallocation
3459 */
3460 list_for_each(cur, &grp->bb_prealloc_list) {
3461 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3462 spin_lock(&pa->pa_lock);
3463 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3464 &groupnr, &start);
3465 len = pa->pa_len;
3466 spin_unlock(&pa->pa_lock);
3467 if (unlikely(len == 0))
3468 continue;
3469 BUG_ON(groupnr != group);
3470 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
3471 bitmap, start, len);
3472 preallocated += len;
3473 count++;
3474 }
3475 mb_debug("prellocated %u for group %lu\n", preallocated, group);
3476}
3477
3478static void ext4_mb_pa_callback(struct rcu_head *head)
3479{
3480 struct ext4_prealloc_space *pa;
3481 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
3482 kmem_cache_free(ext4_pspace_cachep, pa);
3483}
3484
3485/*
3486 * drops a reference to preallocated space descriptor
3487 * if this was the last reference and the space is consumed
3488 */
3489static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3490 struct super_block *sb, struct ext4_prealloc_space *pa)
3491{
3492 unsigned long grp;
3493
3494 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
3495 return;
3496
3497 /* in this short window concurrent discard can set pa_deleted */
3498 spin_lock(&pa->pa_lock);
3499 if (pa->pa_deleted == 1) {
3500 spin_unlock(&pa->pa_lock);
3501 return;
3502 }
3503
3504 pa->pa_deleted = 1;
3505 spin_unlock(&pa->pa_lock);
3506
3507 /* -1 is to protect from crossing allocation group */
3508 ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL);
3509
3510 /*
3511 * possible race:
3512 *
3513 * P1 (buddy init) P2 (regular allocation)
3514 * find block B in PA
3515 * copy on-disk bitmap to buddy
3516 * mark B in on-disk bitmap
3517 * drop PA from group
3518 * mark all PAs in buddy
3519 *
3520 * thus, P1 initializes buddy with B available. to prevent this
3521 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
3522 * against that pair
3523 */
3524 ext4_lock_group(sb, grp);
3525 list_del(&pa->pa_group_list);
3526 ext4_unlock_group(sb, grp);
3527
3528 spin_lock(pa->pa_obj_lock);
3529 list_del_rcu(&pa->pa_inode_list);
3530 spin_unlock(pa->pa_obj_lock);
3531
3532 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3533}
3534
3535/*
3536 * creates new preallocated space for given inode
3537 */
3538static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3539{
3540 struct super_block *sb = ac->ac_sb;
3541 struct ext4_prealloc_space *pa;
3542 struct ext4_group_info *grp;
3543 struct ext4_inode_info *ei;
3544
3545 /* preallocate only when found space is larger then requested */
3546 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3547 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3548 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3549
3550 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3551 if (pa == NULL)
3552 return -ENOMEM;
3553
3554 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
3555 int winl;
3556 int wins;
3557 int win;
3558 int offs;
3559
3560 /* we can't allocate as much as normalizer wants.
3561 * so, found space must get proper lstart
3562 * to cover original request */
3563 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
3564 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
3565
3566 /* we're limited by original request in that
3567 * logical block must be covered any way
3568 * winl is window we can move our chunk within */
3569 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3570
3571 /* also, we should cover whole original request */
3572 wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
3573
3574 /* the smallest one defines real window */
3575 win = min(winl, wins);
3576
3577 offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
3578 if (offs && offs < win)
3579 win = offs;
3580
3581 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
3582 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3583 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3584 }
3585
3586 /* preallocation can change ac_b_ex, thus we store actually
3587 * allocated blocks for history */
3588 ac->ac_f_ex = ac->ac_b_ex;
3589
3590 pa->pa_lstart = ac->ac_b_ex.fe_logical;
3591 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3592 pa->pa_len = ac->ac_b_ex.fe_len;
3593 pa->pa_free = pa->pa_len;
3594 atomic_set(&pa->pa_count, 1);
3595 spin_lock_init(&pa->pa_lock);
3596 pa->pa_deleted = 0;
3597 pa->pa_linear = 0;
3598
3599 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3600 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3601
3602 ext4_mb_use_inode_pa(ac, pa);
3603 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3604
3605 ei = EXT4_I(ac->ac_inode);
3606 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3607
3608 pa->pa_obj_lock = &ei->i_prealloc_lock;
3609 pa->pa_inode = ac->ac_inode;
3610
3611 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3612 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3613 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3614
3615 spin_lock(pa->pa_obj_lock);
3616 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
3617 spin_unlock(pa->pa_obj_lock);
3618
3619 return 0;
3620}
3621
3622/*
3623 * creates new preallocated space for locality group inodes belongs to
3624 */
3625static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3626{
3627 struct super_block *sb = ac->ac_sb;
3628 struct ext4_locality_group *lg;
3629 struct ext4_prealloc_space *pa;
3630 struct ext4_group_info *grp;
3631
3632 /* preallocate only when found space is larger then requested */
3633 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3634 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3635 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3636
3637 BUG_ON(ext4_pspace_cachep == NULL);
3638 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3639 if (pa == NULL)
3640 return -ENOMEM;
3641
3642 /* preallocation can change ac_b_ex, thus we store actually
3643 * allocated blocks for history */
3644 ac->ac_f_ex = ac->ac_b_ex;
3645
3646 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3647 pa->pa_lstart = pa->pa_pstart;
3648 pa->pa_len = ac->ac_b_ex.fe_len;
3649 pa->pa_free = pa->pa_len;
3650 atomic_set(&pa->pa_count, 1);
3651 spin_lock_init(&pa->pa_lock);
3652 pa->pa_deleted = 0;
3653 pa->pa_linear = 1;
3654
3655 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3656 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3657
3658 ext4_mb_use_group_pa(ac, pa);
3659 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3660
3661 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3662 lg = ac->ac_lg;
3663 BUG_ON(lg == NULL);
3664
3665 pa->pa_obj_lock = &lg->lg_prealloc_lock;
3666 pa->pa_inode = NULL;
3667
3668 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3669 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3670 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3671
3672 spin_lock(pa->pa_obj_lock);
3673 list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
3674 spin_unlock(pa->pa_obj_lock);
3675
3676 return 0;
3677}
3678
3679static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3680{
3681 int err;
3682
3683 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3684 err = ext4_mb_new_group_pa(ac);
3685 else
3686 err = ext4_mb_new_inode_pa(ac);
3687 return err;
3688}
3689
3690/*
3691 * finds all unused blocks in on-disk bitmap, frees them in
3692 * in-core bitmap and buddy.
3693 * @pa must be unlinked from inode and group lists, so that
3694 * nobody else can find/use it.
3695 * the caller MUST hold group/inode locks.
3696 * TODO: optimize the case when there are no in-core structures yet
3697 */
3698static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
3699 struct buffer_head *bitmap_bh,
3700 struct ext4_prealloc_space *pa)
3701{
3702 struct ext4_allocation_context ac;
3703 struct super_block *sb = e4b->bd_sb;
3704 struct ext4_sb_info *sbi = EXT4_SB(sb);
3705 unsigned long end;
3706 unsigned long next;
3707 ext4_group_t group;
3708 ext4_grpblk_t bit;
3709 sector_t start;
3710 int err = 0;
3711 int free = 0;
3712
3713 BUG_ON(pa->pa_deleted == 0);
3714 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3715 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3716 end = bit + pa->pa_len;
3717
3718 ac.ac_sb = sb;
3719 ac.ac_inode = pa->pa_inode;
3720 ac.ac_op = EXT4_MB_HISTORY_DISCARD;
3721
3722 while (bit < end) {
3723 bit = ext4_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3724 if (bit >= end)
3725 break;
3726 next = ext4_find_next_bit(bitmap_bh->b_data, end, bit);
3727 if (next > end)
3728 next = end;
3729 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
3730 le32_to_cpu(sbi->s_es->s_first_data_block);
3731 mb_debug(" free preallocated %u/%u in group %u\n",
3732 (unsigned) start, (unsigned) next - bit,
3733 (unsigned) group);
3734 free += next - bit;
3735
3736 ac.ac_b_ex.fe_group = group;
3737 ac.ac_b_ex.fe_start = bit;
3738 ac.ac_b_ex.fe_len = next - bit;
3739 ac.ac_b_ex.fe_logical = 0;
3740 ext4_mb_store_history(&ac);
3741
3742 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3743 bit = next + 1;
3744 }
3745 if (free != pa->pa_free) {
3746 printk(KERN_ERR "pa %p: logic %lu, phys. %lu, len %lu\n",
3747 pa, (unsigned long) pa->pa_lstart,
3748 (unsigned long) pa->pa_pstart,
3749 (unsigned long) pa->pa_len);
3750 printk(KERN_ERR "free %u, pa_free %u\n", free, pa->pa_free);
3751 }
3752 BUG_ON(free != pa->pa_free);
3753 atomic_add(free, &sbi->s_mb_discarded);
3754
3755 return err;
3756}
3757
3758static int ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3759 struct ext4_prealloc_space *pa)
3760{
3761 struct ext4_allocation_context ac;
3762 struct super_block *sb = e4b->bd_sb;
3763 ext4_group_t group;
3764 ext4_grpblk_t bit;
3765
3766 ac.ac_op = EXT4_MB_HISTORY_DISCARD;
3767
3768 BUG_ON(pa->pa_deleted == 0);
3769 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3770 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3771 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3772 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3773
3774 ac.ac_sb = sb;
3775 ac.ac_inode = NULL;
3776 ac.ac_b_ex.fe_group = group;
3777 ac.ac_b_ex.fe_start = bit;
3778 ac.ac_b_ex.fe_len = pa->pa_len;
3779 ac.ac_b_ex.fe_logical = 0;
3780 ext4_mb_store_history(&ac);
3781
3782 return 0;
3783}
3784
3785/*
3786 * releases all preallocations in given group
3787 *
3788 * first, we need to decide discard policy:
3789 * - when do we discard
3790 * 1) ENOSPC
3791 * - how many do we discard
3792 * 1) how many requested
3793 */
3794static int ext4_mb_discard_group_preallocations(struct super_block *sb,
3795 ext4_group_t group, int needed)
3796{
3797 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3798 struct buffer_head *bitmap_bh = NULL;
3799 struct ext4_prealloc_space *pa, *tmp;
3800 struct list_head list;
3801 struct ext4_buddy e4b;
3802 int err;
3803 int busy = 0;
3804 int free = 0;
3805
3806 mb_debug("discard preallocation for group %lu\n", group);
3807
3808 if (list_empty(&grp->bb_prealloc_list))
3809 return 0;
3810
3811 bitmap_bh = read_block_bitmap(sb, group);
3812 if (bitmap_bh == NULL) {
3813 /* error handling here */
3814 ext4_mb_release_desc(&e4b);
3815 BUG_ON(bitmap_bh == NULL);
3816 }
3817
3818 err = ext4_mb_load_buddy(sb, group, &e4b);
3819 BUG_ON(err != 0); /* error handling here */
3820
3821 if (needed == 0)
3822 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
3823
3824 grp = ext4_get_group_info(sb, group);
3825 INIT_LIST_HEAD(&list);
3826
3827repeat:
3828 ext4_lock_group(sb, group);
3829 list_for_each_entry_safe(pa, tmp,
3830 &grp->bb_prealloc_list, pa_group_list) {
3831 spin_lock(&pa->pa_lock);
3832 if (atomic_read(&pa->pa_count)) {
3833 spin_unlock(&pa->pa_lock);
3834 busy = 1;
3835 continue;
3836 }
3837 if (pa->pa_deleted) {
3838 spin_unlock(&pa->pa_lock);
3839 continue;
3840 }
3841
3842 /* seems this one can be freed ... */
3843 pa->pa_deleted = 1;
3844
3845 /* we can trust pa_free ... */
3846 free += pa->pa_free;
3847
3848 spin_unlock(&pa->pa_lock);
3849
3850 list_del(&pa->pa_group_list);
3851 list_add(&pa->u.pa_tmp_list, &list);
3852 }
3853
3854 /* if we still need more blocks and some PAs were used, try again */
3855 if (free < needed && busy) {
3856 busy = 0;
3857 ext4_unlock_group(sb, group);
3858 /*
3859 * Yield the CPU here so that we don't get soft lockup
3860 * in non preempt case.
3861 */
3862 yield();
3863 goto repeat;
3864 }
3865
3866 /* found anything to free? */
3867 if (list_empty(&list)) {
3868 BUG_ON(free != 0);
3869 goto out;
3870 }
3871
3872 /* now free all selected PAs */
3873 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3874
3875 /* remove from object (inode or locality group) */
3876 spin_lock(pa->pa_obj_lock);
3877 list_del_rcu(&pa->pa_inode_list);
3878 spin_unlock(pa->pa_obj_lock);
3879
3880 if (pa->pa_linear)
3881 ext4_mb_release_group_pa(&e4b, pa);
3882 else
3883 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3884
3885 list_del(&pa->u.pa_tmp_list);
3886 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3887 }
3888
3889out:
3890 ext4_unlock_group(sb, group);
3891 ext4_mb_release_desc(&e4b);
3892 put_bh(bitmap_bh);
3893 return free;
3894}
3895
3896/*
3897 * releases all non-used preallocated blocks for given inode
3898 *
3899 * It's important to discard preallocations under i_data_sem
3900 * We don't want another block to be served from the prealloc
3901 * space when we are discarding the inode prealloc space.
3902 *
3903 * FIXME!! Make sure it is valid at all the call sites
3904 */
3905void ext4_mb_discard_inode_preallocations(struct inode *inode)
3906{
3907 struct ext4_inode_info *ei = EXT4_I(inode);
3908 struct super_block *sb = inode->i_sb;
3909 struct buffer_head *bitmap_bh = NULL;
3910 struct ext4_prealloc_space *pa, *tmp;
3911 ext4_group_t group = 0;
3912 struct list_head list;
3913 struct ext4_buddy e4b;
3914 int err;
3915
3916 if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) {
3917 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3918 return;
3919 }
3920
3921 mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
3922
3923 INIT_LIST_HEAD(&list);
3924
3925repeat:
3926 /* first, collect all pa's in the inode */
3927 spin_lock(&ei->i_prealloc_lock);
3928 while (!list_empty(&ei->i_prealloc_list)) {
3929 pa = list_entry(ei->i_prealloc_list.next,
3930 struct ext4_prealloc_space, pa_inode_list);
3931 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
3932 spin_lock(&pa->pa_lock);
3933 if (atomic_read(&pa->pa_count)) {
3934 /* this shouldn't happen often - nobody should
3935 * use preallocation while we're discarding it */
3936 spin_unlock(&pa->pa_lock);
3937 spin_unlock(&ei->i_prealloc_lock);
3938 printk(KERN_ERR "uh-oh! used pa while discarding\n");
3939 WARN_ON(1);
3940 schedule_timeout_uninterruptible(HZ);
3941 goto repeat;
3942
3943 }
3944 if (pa->pa_deleted == 0) {
3945 pa->pa_deleted = 1;
3946 spin_unlock(&pa->pa_lock);
3947 list_del_rcu(&pa->pa_inode_list);
3948 list_add(&pa->u.pa_tmp_list, &list);
3949 continue;
3950 }
3951
3952 /* someone is deleting pa right now */
3953 spin_unlock(&pa->pa_lock);
3954 spin_unlock(&ei->i_prealloc_lock);
3955
3956 /* we have to wait here because pa_deleted
3957 * doesn't mean pa is already unlinked from
3958 * the list. as we might be called from
3959 * ->clear_inode() the inode will get freed
3960 * and concurrent thread which is unlinking
3961 * pa from inode's list may access already
3962 * freed memory, bad-bad-bad */
3963
3964 /* XXX: if this happens too often, we can
3965 * add a flag to force wait only in case
3966 * of ->clear_inode(), but not in case of
3967 * regular truncate */
3968 schedule_timeout_uninterruptible(HZ);
3969 goto repeat;
3970 }
3971 spin_unlock(&ei->i_prealloc_lock);
3972
3973 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3974 BUG_ON(pa->pa_linear != 0);
3975 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
3976
3977 err = ext4_mb_load_buddy(sb, group, &e4b);
3978 BUG_ON(err != 0); /* error handling here */
3979
3980 bitmap_bh = read_block_bitmap(sb, group);
3981 if (bitmap_bh == NULL) {
3982 /* error handling here */
3983 ext4_mb_release_desc(&e4b);
3984 BUG_ON(bitmap_bh == NULL);
3985 }
3986
3987 ext4_lock_group(sb, group);
3988 list_del(&pa->pa_group_list);
3989 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3990 ext4_unlock_group(sb, group);
3991
3992 ext4_mb_release_desc(&e4b);
3993 put_bh(bitmap_bh);
3994
3995 list_del(&pa->u.pa_tmp_list);
3996 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3997 }
3998}
3999
4000/*
4001 * finds all preallocated spaces and return blocks being freed to them
4002 * if preallocated space becomes full (no block is used from the space)
4003 * then the function frees space in buddy
4004 * XXX: at the moment, truncate (which is the only way to free blocks)
4005 * discards all preallocations
4006 */
4007static void ext4_mb_return_to_preallocation(struct inode *inode,
4008 struct ext4_buddy *e4b,
4009 sector_t block, int count)
4010{
4011 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
4012}
#ifdef MB_DEBUG
/*
 * Debug helper: dump an allocation context that could not be satisfied.
 *
 * Prints the status, flags, the original/goal/best extents and the scan
 * counters of @ac, then for every block group its preallocations and, for
 * groups with free blocks, the free block and fragment counts.
 */
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
	struct super_block *sb = ac->ac_sb;
	ext4_group_t i;

	printk(KERN_ERR "EXT4-fs: Can't allocate:"
			" Allocation context details:\n");
	printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
			ac->ac_status, ac->ac_flags);
	printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, "
			"best %lu/%lu/%lu@%lu cr %d\n",
			(unsigned long)ac->ac_o_ex.fe_group,
			(unsigned long)ac->ac_o_ex.fe_start,
			(unsigned long)ac->ac_o_ex.fe_len,
			(unsigned long)ac->ac_o_ex.fe_logical,
			(unsigned long)ac->ac_g_ex.fe_group,
			(unsigned long)ac->ac_g_ex.fe_start,
			(unsigned long)ac->ac_g_ex.fe_len,
			(unsigned long)ac->ac_g_ex.fe_logical,
			(unsigned long)ac->ac_b_ex.fe_group,
			(unsigned long)ac->ac_b_ex.fe_start,
			(unsigned long)ac->ac_b_ex.fe_len,
			(unsigned long)ac->ac_b_ex.fe_logical,
			(int)ac->ac_criteria);
	printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
		ac->ac_found);
	printk(KERN_ERR "EXT4-fs: groups: \n");
	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
		struct ext4_group_info *grp = ext4_get_group_info(sb, i);
		struct ext4_prealloc_space *pa;
		ext4_grpblk_t start;
		struct list_head *cur;

		/* the group lock keeps the preallocation list stable */
		ext4_lock_group(sb, i);
		list_for_each(cur, &grp->bb_prealloc_list) {
			pa = list_entry(cur, struct ext4_prealloc_space,
					pa_group_list);
			spin_lock(&pa->pa_lock);
			ext4_get_group_no_and_offset(sb, pa->pa_pstart,
						     NULL, &start);
			spin_unlock(&pa->pa_lock);
			printk(KERN_ERR "PA:%lu:%d:%u \n", i,
							start, pa->pa_len);
		}
		/*
		 * Release the group lock taken above; locking it a second
		 * time here would leave it held forever.
		 */
		ext4_unlock_group(sb, i);

		if (grp->bb_free == 0)
			continue;
		printk(KERN_ERR "%lu: %d/%d \n",
		       i, grp->bb_free, grp->bb_fragments);
	}
	printk(KERN_ERR "\n");
}
#else
/* Without MB_DEBUG the dump is compiled out. */
static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
	return;
}
#endif
4072
4073/*
4074 * We use locality group preallocation for small size file. The size of the
4075 * file is determined by the current size or the resulting size after
4076 * allocation which ever is larger
4077 *
4078 * One can tune this size via /proc/fs/ext4/<partition>/stream_req
4079 */
4080static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4081{
4082 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4083 int bsbits = ac->ac_sb->s_blocksize_bits;
4084 loff_t size, isize;
4085
4086 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4087 return;
4088
4089 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4090 isize = i_size_read(ac->ac_inode) >> bsbits;
4091 size = max(size, isize);
4092
4093 /* don't use group allocation for large files */
4094 if (size >= sbi->s_mb_stream_request)
4095 return;
4096
4097 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4098 return;
4099
4100 BUG_ON(ac->ac_lg != NULL);
4101 /*
4102 * locality group prealloc space are per cpu. The reason for having
4103 * per cpu locality group is to reduce the contention between block
4104 * request from multiple CPUs.
4105 */
4106 ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
4107 put_cpu();
4108
4109 /* we're going to use group allocation */
4110 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
4111
4112 /* serialize all allocations in the group */
4113 mutex_lock(&ac->ac_lg->lg_mutex);
4114}
4115
4116static int ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4117 struct ext4_allocation_request *ar)
4118{
4119 struct super_block *sb = ar->inode->i_sb;
4120 struct ext4_sb_info *sbi = EXT4_SB(sb);
4121 struct ext4_super_block *es = sbi->s_es;
4122 ext4_group_t group;
4123 unsigned long len;
4124 unsigned long goal;
4125 ext4_grpblk_t block;
4126
4127 /* we can't allocate > group size */
4128 len = ar->len;
4129
4130 /* just a dirty hack to filter too big requests */
4131 if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10)
4132 len = EXT4_BLOCKS_PER_GROUP(sb) - 10;
4133
4134 /* start searching from the goal */
4135 goal = ar->goal;
4136 if (goal < le32_to_cpu(es->s_first_data_block) ||
4137 goal >= ext4_blocks_count(es))
4138 goal = le32_to_cpu(es->s_first_data_block);
4139 ext4_get_group_no_and_offset(sb, goal, &group, &block);
4140
4141 /* set up allocation goals */
4142 ac->ac_b_ex.fe_logical = ar->logical;
4143 ac->ac_b_ex.fe_group = 0;
4144 ac->ac_b_ex.fe_start = 0;
4145 ac->ac_b_ex.fe_len = 0;
4146 ac->ac_status = AC_STATUS_CONTINUE;
4147 ac->ac_groups_scanned = 0;
4148 ac->ac_ex_scanned = 0;
4149 ac->ac_found = 0;
4150 ac->ac_sb = sb;
4151 ac->ac_inode = ar->inode;
4152 ac->ac_o_ex.fe_logical = ar->logical;
4153 ac->ac_o_ex.fe_group = group;
4154 ac->ac_o_ex.fe_start = block;
4155 ac->ac_o_ex.fe_len = len;
4156 ac->ac_g_ex.fe_logical = ar->logical;
4157 ac->ac_g_ex.fe_group = group;
4158 ac->ac_g_ex.fe_start = block;
4159 ac->ac_g_ex.fe_len = len;
4160 ac->ac_f_ex.fe_len = 0;
4161 ac->ac_flags = ar->flags;
4162 ac->ac_2order = 0;
4163 ac->ac_criteria = 0;
4164 ac->ac_pa = NULL;
4165 ac->ac_bitmap_page = NULL;
4166 ac->ac_buddy_page = NULL;
4167 ac->ac_lg = NULL;
4168
4169 /* we have to define context: we'll we work with a file or
4170 * locality group. this is a policy, actually */
4171 ext4_mb_group_or_file(ac);
4172
4173 mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4174 "left: %u/%u, right %u/%u to %swritable\n",
4175 (unsigned) ar->len, (unsigned) ar->logical,
4176 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
4177 (unsigned) ar->lleft, (unsigned) ar->pleft,
4178 (unsigned) ar->lright, (unsigned) ar->pright,
4179 atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4180 return 0;
4181
4182}
4183
4184/*
4185 * release all resource we used in allocation
4186 */
4187static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4188{
4189 if (ac->ac_pa) {
4190 if (ac->ac_pa->pa_linear) {
4191 /* see comment in ext4_mb_use_group_pa() */
4192 spin_lock(&ac->ac_pa->pa_lock);
4193 ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
4194 ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
4195 ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
4196 ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
4197 spin_unlock(&ac->ac_pa->pa_lock);
4198 }
4199 ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
4200 }
4201 if (ac->ac_bitmap_page)
4202 page_cache_release(ac->ac_bitmap_page);
4203 if (ac->ac_buddy_page)
4204 page_cache_release(ac->ac_buddy_page);
4205 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4206 mutex_unlock(&ac->ac_lg->lg_mutex);
4207 ext4_mb_collect_stats(ac);
4208 return 0;
4209}
4210
4211static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4212{
4213 ext4_group_t i;
4214 int ret;
4215 int freed = 0;
4216
4217 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
4218 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4219 freed += ret;
4220 needed -= ret;
4221 }
4222
4223 return freed;
4224}
4225
4226/*
4227 * Main entry point into mballoc to allocate blocks
4228 * it tries to use preallocation first, then falls back
4229 * to usual allocation
4230 */
4231ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4232 struct ext4_allocation_request *ar, int *errp)
4233{
4234 struct ext4_allocation_context ac;
4235 struct ext4_sb_info *sbi;
4236 struct super_block *sb;
4237 ext4_fsblk_t block = 0;
4238 int freed;
4239 int inquota;
4240
4241 sb = ar->inode->i_sb;
4242 sbi = EXT4_SB(sb);
4243
4244 if (!test_opt(sb, MBALLOC)) {
4245 block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
4246 &(ar->len), errp);
4247 return block;
4248 }
4249
4250 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
4251 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4252 ar->len--;
4253 }
4254 if (ar->len == 0) {
4255 *errp = -EDQUOT;
4256 return 0;
4257 }
4258 inquota = ar->len;
4259
4260 ext4_mb_poll_new_transaction(sb, handle);
4261
4262 *errp = ext4_mb_initialize_context(&ac, ar);
4263 if (*errp) {
4264 ar->len = 0;
4265 goto out;
4266 }
4267
4268 ac.ac_op = EXT4_MB_HISTORY_PREALLOC;
4269 if (!ext4_mb_use_preallocated(&ac)) {
4270
4271 ac.ac_op = EXT4_MB_HISTORY_ALLOC;
4272 ext4_mb_normalize_request(&ac, ar);
4273
4274repeat:
4275 /* allocate space in core */
4276 ext4_mb_regular_allocator(&ac);
4277
4278 /* as we've just preallocated more space than
4279 * user requested orinally, we store allocated
4280 * space in a special descriptor */
4281 if (ac.ac_status == AC_STATUS_FOUND &&
4282 ac.ac_o_ex.fe_len < ac.ac_b_ex.fe_len)
4283 ext4_mb_new_preallocation(&ac);
4284 }
4285
4286 if (likely(ac.ac_status == AC_STATUS_FOUND)) {
4287 ext4_mb_mark_diskspace_used(&ac, handle);
4288 *errp = 0;
4289 block = ext4_grp_offs_to_block(sb, &ac.ac_b_ex);
4290 ar->len = ac.ac_b_ex.fe_len;
4291 } else {
4292 freed = ext4_mb_discard_preallocations(sb, ac.ac_o_ex.fe_len);
4293 if (freed)
4294 goto repeat;
4295 *errp = -ENOSPC;
4296 ac.ac_b_ex.fe_len = 0;
4297 ar->len = 0;
4298 ext4_mb_show_ac(&ac);
4299 }
4300
4301 ext4_mb_release_context(&ac);
4302
4303out:
4304 if (ar->len < inquota)
4305 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
4306
4307 return block;
4308}
4309static void ext4_mb_poll_new_transaction(struct super_block *sb,
4310 handle_t *handle)
4311{
4312 struct ext4_sb_info *sbi = EXT4_SB(sb);
4313
4314 if (sbi->s_last_transaction == handle->h_transaction->t_tid)
4315 return;
4316
4317 /* new transaction! time to close last one and free blocks for
4318 * committed transaction. we know that only transaction can be
4319 * active, so previos transaction can be being logged and we
4320 * know that transaction before previous is known to be already
4321 * logged. this means that now we may free blocks freed in all
4322 * transactions before previous one. hope I'm clear enough ... */
4323
4324 spin_lock(&sbi->s_md_lock);
4325 if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
4326 mb_debug("new transaction %lu, old %lu\n",
4327 (unsigned long) handle->h_transaction->t_tid,
4328 (unsigned long) sbi->s_last_transaction);
4329 list_splice_init(&sbi->s_closed_transaction,
4330 &sbi->s_committed_transaction);
4331 list_splice_init(&sbi->s_active_transaction,
4332 &sbi->s_closed_transaction);
4333 sbi->s_last_transaction = handle->h_transaction->t_tid;
4334 }
4335 spin_unlock(&sbi->s_md_lock);
4336
4337 ext4_mb_free_committed_blocks(sb);
4338}
4339
4340static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4341 ext4_group_t group, ext4_grpblk_t block, int count)
4342{
4343 struct ext4_group_info *db = e4b->bd_info;
4344 struct super_block *sb = e4b->bd_sb;
4345 struct ext4_sb_info *sbi = EXT4_SB(sb);
4346 struct ext4_free_metadata *md;
4347 int i;
4348
4349 BUG_ON(e4b->bd_bitmap_page == NULL);
4350 BUG_ON(e4b->bd_buddy_page == NULL);
4351
4352 ext4_lock_group(sb, group);
4353 for (i = 0; i < count; i++) {
4354 md = db->bb_md_cur;
4355 if (md && db->bb_tid != handle->h_transaction->t_tid) {
4356 db->bb_md_cur = NULL;
4357 md = NULL;
4358 }
4359
4360 if (md == NULL) {
4361 ext4_unlock_group(sb, group);
4362 md = kmalloc(sizeof(*md), GFP_NOFS);
4363 if (md == NULL)
4364 return -ENOMEM;
4365 md->num = 0;
4366 md->group = group;
4367
4368 ext4_lock_group(sb, group);
4369 if (db->bb_md_cur == NULL) {
4370 spin_lock(&sbi->s_md_lock);
4371 list_add(&md->list, &sbi->s_active_transaction);
4372 spin_unlock(&sbi->s_md_lock);
4373 /* protect buddy cache from being freed,
4374 * otherwise we'll refresh it from
4375 * on-disk bitmap and lose not-yet-available
4376 * blocks */
4377 page_cache_get(e4b->bd_buddy_page);
4378 page_cache_get(e4b->bd_bitmap_page);
4379 db->bb_md_cur = md;
4380 db->bb_tid = handle->h_transaction->t_tid;
4381 mb_debug("new md 0x%p for group %lu\n",
4382 md, md->group);
4383 } else {
4384 kfree(md);
4385 md = db->bb_md_cur;
4386 }
4387 }
4388
4389 BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
4390 md->blocks[md->num] = block + i;
4391 md->num++;
4392 if (md->num == EXT4_BB_MAX_BLOCKS) {
4393 /* no more space, put full container on a sb's list */
4394 db->bb_md_cur = NULL;
4395 }
4396 }
4397 ext4_unlock_group(sb, group);
4398 return 0;
4399}
4400
4401/*
4402 * Main entry point into mballoc to free blocks
4403 */
4404void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4405 unsigned long block, unsigned long count,
4406 int metadata, unsigned long *freed)
4407{
4408 struct buffer_head *bitmap_bh = 0;
4409 struct super_block *sb = inode->i_sb;
4410 struct ext4_allocation_context ac;
4411 struct ext4_group_desc *gdp;
4412 struct ext4_super_block *es;
4413 unsigned long overflow;
4414 ext4_grpblk_t bit;
4415 struct buffer_head *gd_bh;
4416 ext4_group_t block_group;
4417 struct ext4_sb_info *sbi;
4418 struct ext4_buddy e4b;
4419 int err = 0;
4420 int ret;
4421
4422 *freed = 0;
4423
4424 ext4_mb_poll_new_transaction(sb, handle);
4425
4426 sbi = EXT4_SB(sb);
4427 es = EXT4_SB(sb)->s_es;
4428 if (block < le32_to_cpu(es->s_first_data_block) ||
4429 block + count < block ||
4430 block + count > ext4_blocks_count(es)) {
4431 ext4_error(sb, __FUNCTION__,
4432 "Freeing blocks not in datazone - "
4433 "block = %lu, count = %lu", block, count);
4434 goto error_return;
4435 }
4436
4437 ext4_debug("freeing block %lu\n", block);
4438
4439 ac.ac_op = EXT4_MB_HISTORY_FREE;
4440 ac.ac_inode = inode;
4441 ac.ac_sb = sb;
4442
4443do_more:
4444 overflow = 0;
4445 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4446
4447 /*
4448 * Check to see if we are freeing blocks across a group
4449 * boundary.
4450 */
4451 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4452 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
4453 count -= overflow;
4454 }
4455 bitmap_bh = read_block_bitmap(sb, block_group);
4456 if (!bitmap_bh)
4457 goto error_return;
4458 gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
4459 if (!gdp)
4460 goto error_return;
4461
4462 if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4463 in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4464 in_range(block, ext4_inode_table(sb, gdp),
4465 EXT4_SB(sb)->s_itb_per_group) ||
4466 in_range(block + count - 1, ext4_inode_table(sb, gdp),
4467 EXT4_SB(sb)->s_itb_per_group)) {
4468
4469 ext4_error(sb, __FUNCTION__,
4470 "Freeing blocks in system zone - "
4471 "Block = %lu, count = %lu", block, count);
4472 }
4473
4474 BUFFER_TRACE(bitmap_bh, "getting write access");
4475 err = ext4_journal_get_write_access(handle, bitmap_bh);
4476 if (err)
4477 goto error_return;
4478
4479 /*
4480 * We are about to modify some metadata. Call the journal APIs
4481 * to unshare ->b_data if a currently-committing transaction is
4482 * using it
4483 */
4484 BUFFER_TRACE(gd_bh, "get_write_access");
4485 err = ext4_journal_get_write_access(handle, gd_bh);
4486 if (err)
4487 goto error_return;
4488
4489 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4490 if (err)
4491 goto error_return;
4492
4493#ifdef AGGRESSIVE_CHECK
4494 {
4495 int i;
4496 for (i = 0; i < count; i++)
4497 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4498 }
4499#endif
4500 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4501 bit, count);
4502
4503 /* We dirtied the bitmap block */
4504 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4505 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
4506
4507 ac.ac_b_ex.fe_group = block_group;
4508 ac.ac_b_ex.fe_start = bit;
4509 ac.ac_b_ex.fe_len = count;
4510 ext4_mb_store_history(&ac);
4511
4512 if (metadata) {
4513 /* blocks being freed are metadata. these blocks shouldn't
4514 * be used until this transaction is committed */
4515 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
4516 } else {
4517 ext4_lock_group(sb, block_group);
4518 err = mb_free_blocks(inode, &e4b, bit, count);
4519 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4520 ext4_unlock_group(sb, block_group);
4521 BUG_ON(err != 0);
4522 }
4523
4524 spin_lock(sb_bgl_lock(sbi, block_group));
4525 gdp->bg_free_blocks_count =
4526 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
4527 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4528 spin_unlock(sb_bgl_lock(sbi, block_group));
4529 percpu_counter_add(&sbi->s_freeblocks_counter, count);
4530
4531 ext4_mb_release_desc(&e4b);
4532
4533 *freed += count;
4534
4535 /* And the group descriptor block */
4536 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4537 ret = ext4_journal_dirty_metadata(handle, gd_bh);
4538 if (!err)
4539 err = ret;
4540
4541 if (overflow && !err) {
4542 block += count;
4543 count = overflow;
4544 put_bh(bitmap_bh);
4545 goto do_more;
4546 }
4547 sb->s_dirt = 1;
4548error_return:
4549 brelse(bitmap_bh);
4550 ext4_std_error(sb, err);
4551 return;
4552}
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index ec7cb567a7da..3ebc2332f52e 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -236,10 +236,10 @@ static int free_dind_blocks(handle_t *handle,
236 for (i = 0; i < max_entries; i++) { 236 for (i = 0; i < max_entries; i++) {
237 if (tmp_idata[i]) 237 if (tmp_idata[i])
238 ext4_free_blocks(handle, inode, 238 ext4_free_blocks(handle, inode,
239 le32_to_cpu(tmp_idata[i]), 1); 239 le32_to_cpu(tmp_idata[i]), 1, 1);
240 } 240 }
241 put_bh(bh); 241 put_bh(bh);
242 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1); 242 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
243 return 0; 243 return 0;
244} 244}
245 245
@@ -267,7 +267,7 @@ static int free_tind_blocks(handle_t *handle,
267 } 267 }
268 } 268 }
269 put_bh(bh); 269 put_bh(bh);
270 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1); 270 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
271 return 0; 271 return 0;
272} 272}
273 273
@@ -278,7 +278,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode)
278 278
279 if (ei->i_data[EXT4_IND_BLOCK]) 279 if (ei->i_data[EXT4_IND_BLOCK])
280 ext4_free_blocks(handle, inode, 280 ext4_free_blocks(handle, inode,
281 le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1); 281 le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1, 1);
282 282
283 if (ei->i_data[EXT4_DIND_BLOCK]) { 283 if (ei->i_data[EXT4_DIND_BLOCK]) {
284 retval = free_dind_blocks(handle, inode, 284 retval = free_dind_blocks(handle, inode,
@@ -365,7 +365,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
365 } 365 }
366 } 366 }
367 put_bh(bh); 367 put_bh(bh);
368 ext4_free_blocks(handle, inode, block, 1); 368 ext4_free_blocks(handle, inode, block, 1, 1);
369 return retval; 369 return retval;
370} 370}
371 371
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 64fc7f111734..3a51ffc47790 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -503,6 +503,7 @@ static void ext4_put_super (struct super_block * sb)
503 struct ext4_super_block *es = sbi->s_es; 503 struct ext4_super_block *es = sbi->s_es;
504 int i; 504 int i;
505 505
506 ext4_mb_release(sb);
506 ext4_ext_release(sb); 507 ext4_ext_release(sb);
507 ext4_xattr_put_super(sb); 508 ext4_xattr_put_super(sb);
508 jbd2_journal_destroy(sbi->s_journal); 509 jbd2_journal_destroy(sbi->s_journal);
@@ -569,6 +570,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
569 ei->i_block_alloc_info = NULL; 570 ei->i_block_alloc_info = NULL;
570 ei->vfs_inode.i_version = 1; 571 ei->vfs_inode.i_version = 1;
571 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 572 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
573 INIT_LIST_HEAD(&ei->i_prealloc_list);
574 spin_lock_init(&ei->i_prealloc_lock);
572 return &ei->vfs_inode; 575 return &ei->vfs_inode;
573} 576}
574 577
@@ -881,6 +884,7 @@ enum {
881 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 884 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
882 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 885 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
883 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, 886 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
887 Opt_mballoc, Opt_nomballoc, Opt_stripe,
884}; 888};
885 889
886static match_table_t tokens = { 890static match_table_t tokens = {
@@ -935,6 +939,9 @@ static match_table_t tokens = {
935 {Opt_extents, "extents"}, 939 {Opt_extents, "extents"},
936 {Opt_noextents, "noextents"}, 940 {Opt_noextents, "noextents"},
937 {Opt_i_version, "i_version"}, 941 {Opt_i_version, "i_version"},
942 {Opt_mballoc, "mballoc"},
943 {Opt_nomballoc, "nomballoc"},
944 {Opt_stripe, "stripe=%u"},
938 {Opt_err, NULL}, 945 {Opt_err, NULL},
939 {Opt_resize, "resize"}, 946 {Opt_resize, "resize"},
940}; 947};
@@ -1284,6 +1291,19 @@ clear_qf_name:
1284 set_opt(sbi->s_mount_opt, I_VERSION); 1291 set_opt(sbi->s_mount_opt, I_VERSION);
1285 sb->s_flags |= MS_I_VERSION; 1292 sb->s_flags |= MS_I_VERSION;
1286 break; 1293 break;
1294 case Opt_mballoc:
1295 set_opt(sbi->s_mount_opt, MBALLOC);
1296 break;
1297 case Opt_nomballoc:
1298 clear_opt(sbi->s_mount_opt, MBALLOC);
1299 break;
1300 case Opt_stripe:
1301 if (match_int(&args[0], &option))
1302 return 0;
1303 if (option < 0)
1304 return 0;
1305 sbi->s_stripe = option;
1306 break;
1287 default: 1307 default:
1288 printk (KERN_ERR 1308 printk (KERN_ERR
1289 "EXT4-fs: Unrecognized mount option \"%s\" " 1309 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1742,6 +1762,34 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
1742 return (has_super + ext4_group_first_block_no(sb, bg)); 1762 return (has_super + ext4_group_first_block_no(sb, bg));
1743} 1763}
1744 1764
1765/**
1766 * ext4_get_stripe_size: Get the stripe size.
1767 * @sbi: In memory super block info
1768 *
1769 * Prefer the stripe size given as a mount option (sbi->s_stripe) when it
1770 * is non-zero and no larger than the blocks per group. Otherwise fall
1771 * back to the superblock's s_raid_stripe_width and, only if that exceeds
1772 * the blocks per group, to s_raid_stride. Return 0 when every candidate
1773 * is larger than the blocks per group: the allocator needs the stripe
1774 * size to be no larger than the blocks per group.
1775 */
1776static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
1777{
1778 unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
1779 unsigned long stripe_width =
1780 le32_to_cpu(sbi->s_es->s_raid_stripe_width);
1781
1782 if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
1783 return sbi->s_stripe;
1784
1785 if (stripe_width <= sbi->s_blocks_per_group)
1786 return stripe_width;
1787
1788 if (stride <= sbi->s_blocks_per_group)
1789 return stride;
1790
1791 return 0;
1792}
1745 1793
1746static int ext4_fill_super (struct super_block *sb, void *data, int silent) 1794static int ext4_fill_super (struct super_block *sb, void *data, int silent)
1747 __releases(kernel_sem) 1795 __releases(kernel_sem)
@@ -2091,6 +2139,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
2091 sbi->s_rsv_window_head.rsv_goal_size = 0; 2139 sbi->s_rsv_window_head.rsv_goal_size = 0;
2092 ext4_rsv_window_add(sb, &sbi->s_rsv_window_head); 2140 ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
2093 2141
2142 sbi->s_stripe = ext4_get_stripe_size(sbi);
2143
2094 /* 2144 /*
2095 * set up enough so that it can read an inode 2145 * set up enough so that it can read an inode
2096 */ 2146 */
@@ -2250,6 +2300,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
2250 "writeback"); 2300 "writeback");
2251 2301
2252 ext4_ext_init(sb); 2302 ext4_ext_init(sb);
2303 ext4_mb_init(sb, needs_recovery);
2253 2304
2254 lock_kernel(); 2305 lock_kernel();
2255 return 0; 2306 return 0;
@@ -3232,9 +3283,15 @@ static struct file_system_type ext4dev_fs_type = {
3232 3283
3233static int __init init_ext4_fs(void) 3284static int __init init_ext4_fs(void)
3234{ 3285{
3235 int err = init_ext4_xattr(); 3286 int err;
3287
3288 err = init_ext4_mballoc();
3236 if (err) 3289 if (err)
3237 return err; 3290 return err;
3291
3292 err = init_ext4_xattr();
3293 if (err)
3294 goto out2;
3238 err = init_inodecache(); 3295 err = init_inodecache();
3239 if (err) 3296 if (err)
3240 goto out1; 3297 goto out1;
@@ -3246,6 +3303,8 @@ out:
3246 destroy_inodecache(); 3303 destroy_inodecache();
3247out1: 3304out1:
3248 exit_ext4_xattr(); 3305 exit_ext4_xattr();
3306out2:
3307 exit_ext4_mballoc();
3249 return err; 3308 return err;
3250} 3309}
3251 3310
@@ -3254,6 +3313,7 @@ static void __exit exit_ext4_fs(void)
3254 unregister_filesystem(&ext4dev_fs_type); 3313 unregister_filesystem(&ext4dev_fs_type);
3255 destroy_inodecache(); 3314 destroy_inodecache();
3256 exit_ext4_xattr(); 3315 exit_ext4_xattr();
3316 exit_ext4_mballoc();
3257} 3317}
3258 3318
3259MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3319MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 86387302c2a9..d7962139c010 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -480,7 +480,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
480 ea_bdebug(bh, "refcount now=0; freeing"); 480 ea_bdebug(bh, "refcount now=0; freeing");
481 if (ce) 481 if (ce)
482 mb_cache_entry_free(ce); 482 mb_cache_entry_free(ce);
483 ext4_free_blocks(handle, inode, bh->b_blocknr, 1); 483 ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
484 get_bh(bh); 484 get_bh(bh);
485 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 485 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
486 } else { 486 } else {
@@ -821,7 +821,7 @@ inserted:
821 new_bh = sb_getblk(sb, block); 821 new_bh = sb_getblk(sb, block);
822 if (!new_bh) { 822 if (!new_bh) {
823getblk_failed: 823getblk_failed:
824 ext4_free_blocks(handle, inode, block, 1); 824 ext4_free_blocks(handle, inode, block, 1, 1);
825 error = -EIO; 825 error = -EIO;
826 goto cleanup; 826 goto cleanup;
827 } 827 }
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index d0b7ca99b91f..1852313fc7c7 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -20,6 +20,8 @@
20#include <linux/blkdev.h> 20#include <linux/blkdev.h>
21#include <linux/magic.h> 21#include <linux/magic.h>
22 22
23#include <linux/ext4_fs_i.h>
24
23/* 25/*
24 * The second extended filesystem constants/structures 26 * The second extended filesystem constants/structures
25 */ 27 */
@@ -51,6 +53,50 @@
51#define ext4_debug(f, a...) do {} while (0) 53#define ext4_debug(f, a...) do {} while (0)
52#endif 54#endif
53 55
56#define EXT4_MULTIBLOCK_ALLOCATOR 1
57/* EXT4_MB_HINT_*: bit flags carried in ext4_allocation_request.flags */
58/* prefer goal again. length (NOTE(review): wording unclear - confirm intent) */
59#define EXT4_MB_HINT_MERGE 1
60/* blocks already reserved */
61#define EXT4_MB_HINT_RESERVED 2
62/* metadata is being allocated */
63#define EXT4_MB_HINT_METADATA 4
64/* first blocks in the file */
65#define EXT4_MB_HINT_FIRST 8
66/* search for the best chunk */
67#define EXT4_MB_HINT_BEST 16
68/* data is being allocated */
69#define EXT4_MB_HINT_DATA 32
70/* don't preallocate (for tails) */
71#define EXT4_MB_HINT_NOPREALLOC 64
72/* allocate for locality group */
73#define EXT4_MB_HINT_GROUP_ALLOC 128
74/* allocate goal blocks or none */
75#define EXT4_MB_HINT_GOAL_ONLY 256
76/* goal is meaningful */
77#define EXT4_MB_HINT_TRY_GOAL 512
78
79struct ext4_allocation_request { /* request passed to ext4_mb_new_blocks() */
80 /* target inode for the blocks we're allocating */
81 struct inode *inode;
82 /* logical block in target inode */
83 ext4_lblk_t logical;
84 /* physical target block (a hint) */
85 ext4_fsblk_t goal;
86 /* the closest logical allocated block to the left */
87 ext4_lblk_t lleft;
88 /* physical block for lleft */
89 ext4_fsblk_t pleft;
90 /* the closest logical allocated block to the right */
91 ext4_lblk_t lright;
92 /* physical block for lright */
93 ext4_fsblk_t pright;
94 /* how many blocks we want to allocate */
95 unsigned long len;
96 /* flags: see the EXT4_MB_HINT_* values above */
97 unsigned long flags;
98};
99
54/* 100/*
55 * Special inodes numbers 101 * Special inodes numbers
56 */ 102 */
@@ -474,6 +520,7 @@ do { \
474#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 520#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
475#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 521#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
476#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 522#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
523#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
477/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ 524/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
478#ifndef _LINUX_EXT2_FS_H 525#ifndef _LINUX_EXT2_FS_H
479#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 526#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@@ -912,7 +959,7 @@ extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
912extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, 959extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
913 ext4_fsblk_t goal, unsigned long *count, int *errp); 960 ext4_fsblk_t goal, unsigned long *count, int *errp);
914extern void ext4_free_blocks (handle_t *handle, struct inode *inode, 961extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
915 ext4_fsblk_t block, unsigned long count); 962 ext4_fsblk_t block, unsigned long count, int metadata);
916extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, 963extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
917 ext4_fsblk_t block, unsigned long count, 964 ext4_fsblk_t block, unsigned long count,
918 unsigned long *pdquot_freed_blocks); 965 unsigned long *pdquot_freed_blocks);
@@ -950,6 +997,20 @@ extern unsigned long ext4_count_dirs (struct super_block *);
950extern void ext4_check_inodes_bitmap (struct super_block *); 997extern void ext4_check_inodes_bitmap (struct super_block *);
951extern unsigned long ext4_count_free (struct buffer_head *, unsigned); 998extern unsigned long ext4_count_free (struct buffer_head *, unsigned);
952 999
1000/* mballoc.c */
1001extern long ext4_mb_stats;
1002extern long ext4_mb_max_to_scan;
1003extern int ext4_mb_init(struct super_block *, int);
1004extern int ext4_mb_release(struct super_block *);
1005extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
1006 struct ext4_allocation_request *, int *);
1007extern int ext4_mb_reserve_blocks(struct super_block *, int);
1008extern void ext4_mb_discard_inode_preallocations(struct inode *);
1009extern int __init init_ext4_mballoc(void);
1010extern void exit_ext4_mballoc(void);
1011extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1012 unsigned long, unsigned long, int, unsigned long *);
1013
953 1014
954/* inode.c */ 1015/* inode.c */
955int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 1016int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
@@ -1080,6 +1141,19 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
1080 raw_inode->i_size_high = cpu_to_le32(i_size >> 32); 1141 raw_inode->i_size_high = cpu_to_le32(i_size >> 32);
1081} 1142}
1082 1143
1144static inline
1145struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
1146 ext4_group_t group)
1147{ /* Return the ext4_group_info pointer stored for @group in s_group_info. */
1148 struct ext4_group_info ***grp_info; /* two-level table of pointers */
1149 long indexv, indexh;
1150 grp_info = EXT4_SB(sb)->s_group_info;
1151 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); /* first-level index */
1152 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); /* second-level index; mask presumably relies on a power-of-two count - confirm */
1153 return grp_info[indexv][indexh]; /* NOTE: @group is not range-checked here */
1154}
1155
1156
1083#define ext4_std_error(sb, errno) \ 1157#define ext4_std_error(sb, errno) \
1084do { \ 1158do { \
1085 if ((errno)) \ 1159 if ((errno)) \
diff --git a/include/linux/ext4_fs_i.h b/include/linux/ext4_fs_i.h
index 4377d249d378..d5508d3cf290 100644
--- a/include/linux/ext4_fs_i.h
+++ b/include/linux/ext4_fs_i.h
@@ -158,6 +158,10 @@ struct ext4_inode_info {
158 * struct timespec i_{a,c,m}time in the generic inode. 158 * struct timespec i_{a,c,m}time in the generic inode.
159 */ 159 */
160 struct timespec i_crtime; 160 struct timespec i_crtime;
161
162 /* mballoc */
163 struct list_head i_prealloc_list;
164 spinlock_t i_prealloc_lock;
161}; 165};
162 166
163#endif /* _LINUX_EXT4_FS_I */ 167#endif /* _LINUX_EXT4_FS_I */
diff --git a/include/linux/ext4_fs_sb.h b/include/linux/ext4_fs_sb.h
index 38a47ec06df9..abaae2c8cccf 100644
--- a/include/linux/ext4_fs_sb.h
+++ b/include/linux/ext4_fs_sb.h
@@ -91,6 +91,58 @@ struct ext4_sb_info {
91 unsigned long s_ext_blocks; 91 unsigned long s_ext_blocks;
92 unsigned long s_ext_extents; 92 unsigned long s_ext_extents;
93#endif 93#endif
94
95 /* for buddy allocator */
96 struct ext4_group_info ***s_group_info;
97 struct inode *s_buddy_cache;
98 long s_blocks_reserved;
99 spinlock_t s_reserve_lock;
100 struct list_head s_active_transaction;
101 struct list_head s_closed_transaction;
102 struct list_head s_committed_transaction;
103 spinlock_t s_md_lock;
104 tid_t s_last_transaction;
105 unsigned short *s_mb_offsets, *s_mb_maxs;
106
107 /* tunables */
108 unsigned long s_stripe;
109 unsigned long s_mb_stream_request;
110 unsigned long s_mb_max_to_scan;
111 unsigned long s_mb_min_to_scan;
112 unsigned long s_mb_stats;
113 unsigned long s_mb_order2_reqs;
114 unsigned long s_mb_group_prealloc;
115 /* where last allocation was done - for stream allocation */
116 unsigned long s_mb_last_group;
117 unsigned long s_mb_last_start;
118
119 /* history to debug policy */
120 struct ext4_mb_history *s_mb_history;
121 int s_mb_history_cur;
122 int s_mb_history_max;
123 int s_mb_history_num;
124 struct proc_dir_entry *s_mb_proc;
125 spinlock_t s_mb_history_lock;
126 int s_mb_history_filter;
127
128 /* stats for buddy allocator */
129 spinlock_t s_mb_pa_lock;
130 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
131 atomic_t s_bal_success; /* we found long enough chunks */
132 atomic_t s_bal_allocated; /* in blocks */
133 atomic_t s_bal_ex_scanned; /* total extents scanned */
134 atomic_t s_bal_goals; /* goal hits */
135 atomic_t s_bal_breaks; /* too long searches */
136 atomic_t s_bal_2orders; /* 2^order hits */
137 spinlock_t s_bal_lock;
138 unsigned long s_mb_buddies_generated;
139 unsigned long long s_mb_generation_time;
140 atomic_t s_mb_lost_chunks;
141 atomic_t s_mb_preallocated;
142 atomic_t s_mb_discarded;
143
144 /* locality groups */
145 struct ext4_locality_group *s_locality_groups;
94}; 146};
95 147
96#endif /* _LINUX_EXT4_FS_SB */ 148#endif /* _LINUX_EXT4_FS_SB */