aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> 2008-07-11 19:27:31 -0400
committer: Theodore Ts'o <tytso@mit.edu> 2008-07-11 19:27:31 -0400
commit: cd1aac32923a9c8adcc0ae85e33c1ca0c5855838 (patch)
tree: 3d55d9249ef960a7e345969404d537e36dbd9609
parent: 61628a3f3a37af2bf25daf8e26fd6b76a78c4f76 (diff)
ext4: Add ordered mode support for delalloc
This provides a new ordered mode implementation which gets rid of using buffer heads to enforce the ordering between metadata changes and the related data changes. Instead, in the new ordering mode, it keeps track of all of the inodes touched by each transaction on a list, and when that transaction is committed, it flushes all of the dirty pages for those inodes. In addition, the new ordered mode reverses the lock ordering of the page lock and transaction lock, which provides easier support for delayed allocation. Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r-- fs/ext4/inode.c 30
-rw-r--r-- fs/jbd2/commit.c 38
2 files changed, 58 insertions, 10 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7923336ecf94..24518b57733e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2043,11 +2043,12 @@ static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
2043 return !buffer_mapped(bh) || buffer_delay(bh); 2043 return !buffer_mapped(bh) || buffer_delay(bh);
2044} 2044}
2045 2045
2046/* FIXME!! only support data=writeback mode */
2047/* 2046/*
2048 * get called via ext4_da_writepages after taking page lock 2047 * get called via ext4_da_writepages after taking page lock
2049 * We may end up doing block allocation here in case 2048 * We may end up doing block allocation here in case
2050 * mpage_da_map_blocks failed to allocate blocks. 2049 * mpage_da_map_blocks failed to allocate blocks.
2050 *
2051 * We also get called via journal_submit_inode_data_buffers
2051 */ 2052 */
2052static int ext4_da_writepage(struct page *page, 2053static int ext4_da_writepage(struct page *page,
2053 struct writeback_control *wbc) 2054 struct writeback_control *wbc)
@@ -2066,6 +2067,7 @@ static int ext4_da_writepage(struct page *page,
2066 * ext4_da_writepages() but directly (shrink_page_list). 2067 * ext4_da_writepages() but directly (shrink_page_list).
2067 * We cannot easily start a transaction here so we just skip 2068 * We cannot easily start a transaction here so we just skip
2068 * writing the page in case we would have to do so. 2069 * writing the page in case we would have to do so.
2070 * We reach here also via journal_submit_inode_data_buffers
2069 */ 2071 */
2070 size = i_size_read(inode); 2072 size = i_size_read(inode);
2071 2073
@@ -2081,8 +2083,11 @@ static int ext4_da_writepage(struct page *page,
2081 * We can't do block allocation under 2083 * We can't do block allocation under
2082 * page lock without a handle . So redirty 2084 * page lock without a handle . So redirty
2083 * the page and return 2085 * the page and return
2086 * We may reach here when we do a journal commit
2087 * via journal_submit_inode_data_buffers.
2088 * If we don't have mapping block we just ignore
2089 * them
2084 */ 2090 */
2085 BUG_ON(wbc->sync_mode != WB_SYNC_NONE);
2086 redirty_page_for_writepage(wbc, page); 2091 redirty_page_for_writepage(wbc, page);
2087 unlock_page(page); 2092 unlock_page(page);
2088 return 0; 2093 return 0;
@@ -2097,7 +2102,6 @@ static int ext4_da_writepage(struct page *page,
2097 return ret; 2102 return ret;
2098} 2103}
2099 2104
2100
2101/* 2105/*
2102 * For now just follow the DIO way to estimate the max credits 2106 * For now just follow the DIO way to estimate the max credits
2103 * needed to write out EXT4_MAX_WRITEBACK_PAGES. 2107 * needed to write out EXT4_MAX_WRITEBACK_PAGES.
@@ -2130,7 +2134,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2130 return 0; 2134 return 0;
2131 2135
2132 /* 2136 /*
2133 * Estimate the worse case needed credits to write out 2137 * Estimate the worse case needed credits to write out
2134 * EXT4_MAX_BUF_BLOCKS pages 2138 * EXT4_MAX_BUF_BLOCKS pages
2135 */ 2139 */
2136 needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; 2140 needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
@@ -2152,6 +2156,19 @@ static int ext4_da_writepages(struct address_space *mapping,
2152 ret = PTR_ERR(handle); 2156 ret = PTR_ERR(handle);
2153 goto out_writepages; 2157 goto out_writepages;
2154 } 2158 }
2159 if (ext4_should_order_data(inode)) {
2160 /*
2161 * With ordered mode we need to add
2162 * the inode to the journal handle
2163 * when we do block allocation.
2164 */
2165 ret = ext4_jbd2_file_inode(handle, inode);
2166 if (ret) {
2167 ext4_journal_stop(handle);
2168 goto out_writepages;
2169 }
2170
2171 }
2155 /* 2172 /*
2156 * set the max dirty pages could be write at a time 2173 * set the max dirty pages could be write at a time
2157 * to fit into the reserved transaction credits 2174 * to fit into the reserved transaction credits
@@ -2735,7 +2752,10 @@ static const struct address_space_operations ext4_da_aops = {
2735 2752
2736void ext4_set_aops(struct inode *inode) 2753void ext4_set_aops(struct inode *inode)
2737{ 2754{
2738 if (ext4_should_order_data(inode)) 2755 if (ext4_should_order_data(inode) &&
2756 test_opt(inode->i_sb, DELALLOC))
2757 inode->i_mapping->a_ops = &ext4_da_aops;
2758 else if (ext4_should_order_data(inode))
2739 inode->i_mapping->a_ops = &ext4_ordered_aops; 2759 inode->i_mapping->a_ops = &ext4_ordered_aops;
2740 else if (ext4_should_writeback_data(inode) && 2760 else if (ext4_should_writeback_data(inode) &&
2741 test_opt(inode->i_sb, DELALLOC)) 2761 test_opt(inode->i_sb, DELALLOC))
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 483183d15ed5..f8b3be873226 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/crc32.h> 24#include <linux/crc32.h>
25#include <linux/writeback.h>
26#include <linux/backing-dev.h>
25 27
26/* 28/*
27 * Default IO end handler for temporary BJ_IO buffer_heads. 29 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -185,6 +187,27 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
185} 187}
186 188
187/* 189/*
190 * write the filemap data using writepage() address_space_operations.
191 * We don't do block allocation here even for delalloc. We don't
192 * use writepages() because with delayed allocation we may be doing
193 * block allocation in writepages().
194 */
195static int journal_submit_inode_data_buffers(struct address_space *mapping)
196{
197 int ret;
198 struct writeback_control wbc = {
199 .sync_mode = WB_SYNC_ALL,
200 .nr_to_write = mapping->nrpages * 2,
201 .range_start = 0,
202 .range_end = i_size_read(mapping->host),
203 .for_writepages = 1,
204 };
205
206 ret = generic_writepages(mapping, &wbc);
207 return ret;
208}
209
210/*
188 * Submit all the data buffers of inode associated with the transaction to 211 * Submit all the data buffers of inode associated with the transaction to
189 * disk. 212 * disk.
190 * 213 *
@@ -192,7 +215,7 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
192 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently 215 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
193 * operate on from being released while we write out pages. 216 * operate on from being released while we write out pages.
194 */ 217 */
195static int journal_submit_inode_data_buffers(journal_t *journal, 218static int journal_submit_data_buffers(journal_t *journal,
196 transaction_t *commit_transaction) 219 transaction_t *commit_transaction)
197{ 220{
198 struct jbd2_inode *jinode; 221 struct jbd2_inode *jinode;
@@ -204,8 +227,13 @@ static int journal_submit_inode_data_buffers(journal_t *journal,
204 mapping = jinode->i_vfs_inode->i_mapping; 227 mapping = jinode->i_vfs_inode->i_mapping;
205 jinode->i_flags |= JI_COMMIT_RUNNING; 228 jinode->i_flags |= JI_COMMIT_RUNNING;
206 spin_unlock(&journal->j_list_lock); 229 spin_unlock(&journal->j_list_lock);
207 err = filemap_fdatawrite_range(mapping, 0, 230 /*
208 i_size_read(jinode->i_vfs_inode)); 231 * submit the inode data buffers. We use writepage
232 * instead of writepages. Because writepages can do
233 * block allocation with delalloc. We need to write
234 * only allocated blocks here.
235 */
236 err = journal_submit_inode_data_buffers(mapping);
209 if (!ret) 237 if (!ret)
210 ret = err; 238 ret = err;
211 spin_lock(&journal->j_list_lock); 239 spin_lock(&journal->j_list_lock);
@@ -228,7 +256,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
228 struct jbd2_inode *jinode, *next_i; 256 struct jbd2_inode *jinode, *next_i;
229 int err, ret = 0; 257 int err, ret = 0;
230 258
231 /* For locking, see the comment in journal_submit_inode_data_buffers() */ 259 /* For locking, see the comment in journal_submit_data_buffers() */
232 spin_lock(&journal->j_list_lock); 260 spin_lock(&journal->j_list_lock);
233 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 261 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
234 jinode->i_flags |= JI_COMMIT_RUNNING; 262 jinode->i_flags |= JI_COMMIT_RUNNING;
@@ -431,7 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
431 * Now start flushing things to disk, in the order they appear 459 * Now start flushing things to disk, in the order they appear
432 * on the transaction lists. Data blocks go first. 460 * on the transaction lists. Data blocks go first.
433 */ 461 */
434 err = journal_submit_inode_data_buffers(journal, commit_transaction); 462 err = journal_submit_data_buffers(journal, commit_transaction);
435 if (err) 463 if (err)
436 jbd2_journal_abort(journal, err); 464 jbd2_journal_abort(journal, err);
437 465