aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJan Kara <jack@suse.cz>2008-07-11 19:27:31 -0400
committerTheodore Ts'o <tytso@mit.edu>2008-07-11 19:27:31 -0400
commitc851ed540173736e60d48b53b91a16ea5c903896 (patch)
tree828fe0d71b7f18dc170090dbb2fb5ac9deae4ee0
parentf4c0a0fdfae708f7aa438c27a380ed4071294e11 (diff)
jbd2: Implement data=ordered mode handling via inodes
This patch adds the necessary framework to JBD2 to track inodes with each transaction and write out their dirty data at transaction commit time. This new ordered mode brings all sorts of advantages, such as the possibility of getting rid of journal heads and buffer heads for data buffers in ordered mode, better ordering of writes on transaction commit, simplification of some JBD code, and no more anonymous pages when a truncate of data being committed happens. Also, with this new ordered mode, delayed allocation in ordered mode is much simpler. Signed-off-by: Jan Kara <jack@suse.cz>
-rw-r--r--fs/jbd2/commit.c90
-rw-r--r--fs/jbd2/journal.c52
-rw-r--r--fs/jbd2/transaction.c86
-rw-r--r--include/linux/jbd2.h42
4 files changed, 270 insertions, 0 deletions
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 92b6ac3df8ab..3ca107b5c86b 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -355,6 +355,81 @@ write_out_data:
355 journal_do_submit_data(wbuf, bufs); 355 journal_do_submit_data(wbuf, bufs);
356} 356}
357 357
358/*
359 * Submit all the data buffers of inode associated with the transaction to
360 * disk.
361 *
362 * We are in a committing transaction. Therefore no new inode can be added to
363 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
364 * operate on from being released while we write out pages.
365 */
366static int journal_submit_inode_data_buffers(journal_t *journal,
367 transaction_t *commit_transaction)
368{
369 struct jbd2_inode *jinode;
370 int err, ret = 0;
371 struct address_space *mapping;
372
373 spin_lock(&journal->j_list_lock);
374 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
375 mapping = jinode->i_vfs_inode->i_mapping;
376 jinode->i_flags |= JI_COMMIT_RUNNING;
377 spin_unlock(&journal->j_list_lock);
378 err = filemap_fdatawrite_range(mapping, 0,
379 i_size_read(jinode->i_vfs_inode));
380 if (!ret)
381 ret = err;
382 spin_lock(&journal->j_list_lock);
383 J_ASSERT(jinode->i_transaction == commit_transaction);
384 jinode->i_flags &= ~JI_COMMIT_RUNNING;
385 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
386 }
387 spin_unlock(&journal->j_list_lock);
388 return ret;
389}
390
391/*
392 * Wait for data submitted for writeout, refile inodes to proper
393 * transaction if needed.
394 *
395 */
396static int journal_finish_inode_data_buffers(journal_t *journal,
397 transaction_t *commit_transaction)
398{
399 struct jbd2_inode *jinode, *next_i;
400 int err, ret = 0;
401
402 /* For locking, see the comment in journal_submit_inode_data_buffers() */
403 spin_lock(&journal->j_list_lock);
404 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
405 jinode->i_flags |= JI_COMMIT_RUNNING;
406 spin_unlock(&journal->j_list_lock);
407 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
408 if (!ret)
409 ret = err;
410 spin_lock(&journal->j_list_lock);
411 jinode->i_flags &= ~JI_COMMIT_RUNNING;
412 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
413 }
414
415 /* Now refile inode to proper lists */
416 list_for_each_entry_safe(jinode, next_i,
417 &commit_transaction->t_inode_list, i_list) {
418 list_del(&jinode->i_list);
419 if (jinode->i_next_transaction) {
420 jinode->i_transaction = jinode->i_next_transaction;
421 jinode->i_next_transaction = NULL;
422 list_add(&jinode->i_list,
423 &jinode->i_transaction->t_inode_list);
424 } else {
425 jinode->i_transaction = NULL;
426 }
427 }
428 spin_unlock(&journal->j_list_lock);
429
430 return ret;
431}
432
358static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) 433static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
359{ 434{
360 struct page *page = bh->b_page; 435 struct page *page = bh->b_page;
@@ -529,6 +604,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
529 */ 604 */
530 err = 0; 605 err = 0;
531 journal_submit_data_buffers(journal, commit_transaction); 606 journal_submit_data_buffers(journal, commit_transaction);
607 err = journal_submit_inode_data_buffers(journal, commit_transaction);
608 if (err)
609 jbd2_journal_abort(journal, err);
532 610
533 /* 611 /*
534 * Wait for all previously submitted IO to complete if commit 612 * Wait for all previously submitted IO to complete if commit
@@ -760,6 +838,17 @@ start_journal_io:
760 __jbd2_journal_abort_hard(journal); 838 __jbd2_journal_abort_hard(journal);
761 } 839 }
762 840
841 /*
842 * This is the right place to wait for data buffers both for ASYNC
843 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
844 * the commit block went to disk (which happens above). If commit is
845 * SYNC, we need to wait for data buffers before we start writing
846 * commit block, which happens below in such setting.
847 */
848 err = journal_finish_inode_data_buffers(journal, commit_transaction);
849 if (err)
850 jbd2_journal_abort(journal, err);
851
763 /* Lo and behold: we have just managed to send a transaction to 852 /* Lo and behold: we have just managed to send a transaction to
764 the log. Before we can commit it, wait for the IO so far to 853 the log. Before we can commit it, wait for the IO so far to
765 complete. Control buffers being written are on the 854 complete. Control buffers being written are on the
@@ -880,6 +969,7 @@ wait_for_iobuf:
880 jbd_debug(3, "JBD: commit phase 7\n"); 969 jbd_debug(3, "JBD: commit phase 7\n");
881 970
882 J_ASSERT(commit_transaction->t_sync_datalist == NULL); 971 J_ASSERT(commit_transaction->t_sync_datalist == NULL);
972 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
883 J_ASSERT(commit_transaction->t_buffers == NULL); 973 J_ASSERT(commit_transaction->t_buffers == NULL);
884 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 974 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
885 J_ASSERT(commit_transaction->t_iobuf_list == NULL); 975 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2e24567c4a79..78cf7bd7f604 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -82,6 +82,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
82EXPORT_SYMBOL(jbd2_journal_invalidatepage); 82EXPORT_SYMBOL(jbd2_journal_invalidatepage);
83EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); 83EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
84EXPORT_SYMBOL(jbd2_journal_force_commit); 84EXPORT_SYMBOL(jbd2_journal_force_commit);
85EXPORT_SYMBOL(jbd2_journal_file_inode);
86EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
87EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
88EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
85 89
86static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); 90static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
87static void __journal_abort_soft (journal_t *journal, int errno); 91static void __journal_abort_soft (journal_t *journal, int errno);
@@ -2195,6 +2199,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
2195} 2199}
2196 2200
2197/* 2201/*
2202 * Initialize jbd inode head
2203 */
2204void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
2205{
2206 jinode->i_transaction = NULL;
2207 jinode->i_next_transaction = NULL;
2208 jinode->i_vfs_inode = inode;
2209 jinode->i_flags = 0;
2210 INIT_LIST_HEAD(&jinode->i_list);
2211}
2212
2213/*
2214 * Function to be called before we start removing inode from memory (i.e.,
2215 * clear_inode() is a fine place to be called from). It removes inode from
2216 * transaction's lists.
2217 */
2218void jbd2_journal_release_jbd_inode(journal_t *journal,
2219 struct jbd2_inode *jinode)
2220{
2221 int writeout = 0;
2222
2223 if (!journal)
2224 return;
2225restart:
2226 spin_lock(&journal->j_list_lock);
2227 /* Is commit writing out inode - we have to wait */
2228 if (jinode->i_flags & JI_COMMIT_RUNNING) {
2229 wait_queue_head_t *wq;
2230 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
2231 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
2232 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2233 spin_unlock(&journal->j_list_lock);
2234 schedule();
2235 finish_wait(wq, &wait.wait);
2236 goto restart;
2237 }
2238
2239 /* Do we need to wait for data writeback? */
2240 if (journal->j_committing_transaction == jinode->i_transaction)
2241 writeout = 1;
2242 if (jinode->i_transaction) {
2243 list_del(&jinode->i_list);
2244 jinode->i_transaction = NULL;
2245 }
2246 spin_unlock(&journal->j_list_lock);
2247}
2248
2249/*
2198 * debugfs tunables 2250 * debugfs tunables
2199 */ 2251 */
2200#ifdef CONFIG_JBD2_DEBUG 2252#ifdef CONFIG_JBD2_DEBUG
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index ba620c4493d2..98b596d23705 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -51,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
51 transaction->t_tid = journal->j_transaction_sequence++; 51 transaction->t_tid = journal->j_transaction_sequence++;
52 transaction->t_expires = jiffies + journal->j_commit_interval; 52 transaction->t_expires = jiffies + journal->j_commit_interval;
53 spin_lock_init(&transaction->t_handle_lock); 53 spin_lock_init(&transaction->t_handle_lock);
54 INIT_LIST_HEAD(&transaction->t_inode_list);
54 55
55 /* Set up the commit timer for the new transaction. */ 56 /* Set up the commit timer for the new transaction. */
56 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); 57 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -2195,3 +2196,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2195 spin_unlock(&journal->j_list_lock); 2196 spin_unlock(&journal->j_list_lock);
2196 __brelse(bh); 2197 __brelse(bh);
2197} 2198}
2199
2200/*
2201 * File inode in the inode list of the handle's transaction
2202 */
2203int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
2204{
2205 transaction_t *transaction = handle->h_transaction;
2206 journal_t *journal = transaction->t_journal;
2207
2208 if (is_handle_aborted(handle))
2209 return -EIO;
2210
2211 jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
2212 transaction->t_tid);
2213
2214 /*
2215 * First check whether inode isn't already on the transaction's
2216 * lists without taking the lock. Note that this check is safe
2217 * without the lock as we cannot race with somebody removing inode
2218 * from the transaction. The reason is that we remove inode from the
2219 * transaction only in journal_release_jbd_inode() and when we commit
2220 * the transaction. We are guarded from the first case by holding
2221 * a reference to the inode. We are safe against the second case
2222 * because if jinode->i_transaction == transaction, commit code
2223 * cannot touch the transaction because we hold reference to it,
2224 * and if jinode->i_next_transaction == transaction, commit code
2225 * will only file the inode where we want it.
2226 */
2227 if (jinode->i_transaction == transaction ||
2228 jinode->i_next_transaction == transaction)
2229 return 0;
2230
2231 spin_lock(&journal->j_list_lock);
2232
2233 if (jinode->i_transaction == transaction ||
2234 jinode->i_next_transaction == transaction)
2235 goto done;
2236
2237 /* On some different transaction's list - should be
2238 * the committing one */
2239 if (jinode->i_transaction) {
2240 J_ASSERT(jinode->i_next_transaction == NULL);
2241 J_ASSERT(jinode->i_transaction ==
2242 journal->j_committing_transaction);
2243 jinode->i_next_transaction = transaction;
2244 goto done;
2245 }
2246 /* Not on any transaction list... */
2247 J_ASSERT(!jinode->i_next_transaction);
2248 jinode->i_transaction = transaction;
2249 list_add(&jinode->i_list, &transaction->t_inode_list);
2250done:
2251 spin_unlock(&journal->j_list_lock);
2252
2253 return 0;
2254}
2255
2256/*
2257 * This function must be called when inode is journaled in ordered mode
2258 * before truncation happens. It starts writeout of truncated part in
2259 * case it is in the committing transaction so that we stand to ordered
2260 * mode consistency guarantees.
2261 */
2262int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
2263 loff_t new_size)
2264{
2265 journal_t *journal;
2266 transaction_t *commit_trans;
2267 int ret = 0;
2268
2269 if (!inode->i_transaction && !inode->i_next_transaction)
2270 goto out;
2271 journal = inode->i_transaction->t_journal;
2272 spin_lock(&journal->j_state_lock);
2273 commit_trans = journal->j_committing_transaction;
2274 spin_unlock(&journal->j_state_lock);
2275 if (inode->i_transaction == commit_trans) {
2276 ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
2277 new_size, LLONG_MAX);
2278 if (ret)
2279 jbd2_journal_abort(journal, ret);
2280 }
2281out:
2282 return ret;
2283}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index ec9cadf58227..622c3d8ca4ed 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -381,6 +381,38 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
381 bit_spin_unlock(BH_JournalHead, &bh->b_state); 381 bit_spin_unlock(BH_JournalHead, &bh->b_state);
382} 382}
383 383
/* Flags in jbd2_inode->i_flags */

/* Bit number of the flag below, for the bit-wait interfaces. */
#define __JI_COMMIT_RUNNING 0
/*
 * Commit of the inode data is in progress.  The flag protects the inode
 * from concurrent deletion.  A reference to the inode cannot be used for
 * that, since kjournald must not end up doing the last iput().
 */
#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
391
392/**
393 * struct jbd_inode is the structure linking inodes in ordered mode
394 * present in a transaction so that we can sync them during commit.
395 */
396struct jbd2_inode {
397 /* Which transaction does this inode belong to? Either the running
398 * transaction or the committing one. [j_list_lock] */
399 transaction_t *i_transaction;
400
401 /* Pointer to the running transaction modifying inode's data in case
402 * there is already a committing transaction touching it. [j_list_lock] */
403 transaction_t *i_next_transaction;
404
405 /* List of inodes in the i_transaction [j_list_lock] */
406 struct list_head i_list;
407
408 /* VFS inode this inode belongs to [constant during the lifetime
409 * of the structure] */
410 struct inode *i_vfs_inode;
411
412 /* Flags of inode [j_list_lock] */
413 unsigned int i_flags;
414};
415
384struct jbd2_revoke_table_s; 416struct jbd2_revoke_table_s;
385 417
386/** 418/**
@@ -567,6 +599,12 @@ struct transaction_s
567 struct journal_head *t_log_list; 599 struct journal_head *t_log_list;
568 600
569 /* 601 /*
602 * List of inodes whose data we've modified in data=ordered mode.
603 * [j_list_lock]
604 */
605 struct list_head t_inode_list;
606
607 /*
570 * Protects info related to handles 608 * Protects info related to handles
571 */ 609 */
572 spinlock_t t_handle_lock; 610 spinlock_t t_handle_lock;
@@ -1046,6 +1084,10 @@ extern void jbd2_journal_ack_err (journal_t *);
1046extern int jbd2_journal_clear_err (journal_t *); 1084extern int jbd2_journal_clear_err (journal_t *);
1047extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); 1085extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
1048extern int jbd2_journal_force_commit(journal_t *); 1086extern int jbd2_journal_force_commit(journal_t *);
1087extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
1088extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size);
1089extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
1090extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);
1049 1091
1050/* 1092/*
1051 * journal_head management 1093 * journal_head management