From 736603ab297506f4396cb5af592004499950fcfd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: jbd2: Add commit time into the commit block Carlo Wood has demonstrated that it's possible to recover deleted files from the journal. Something that will make this easier is if we can put the time of the commit into commit block. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index a2ed72f7ceee..92b6ac3df8ab 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -112,6 +112,7 @@ static int journal_submit_commit_record(journal_t *journal, struct buffer_head *bh; int ret; int barrier_done = 0; + struct timespec now = current_kernel_time(); if (is_journal_aborted(journal)) return 0; @@ -126,6 +127,8 @@ static int journal_submit_commit_record(journal_t *journal, tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + tmp->h_commit_sec = cpu_to_be64(now.tv_sec); + tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { -- cgit v1.2.2 From c851ed540173736e60d48b53b91a16ea5c903896 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: jbd2: Implement data=ordered mode handling via inodes This patch adds necessary framework into JBD2 to be able to track inodes with each transaction and write-out their dirty data during transaction commit time. This new ordered mode brings all sorts of advantages such as possibility to get rid of journal heads and buffer heads for data buffers in ordered mode, better ordering of writes on transaction commit, simplification of some JBD code, no more anonymous pages when truncate of data being committed happens. Also with this new ordered mode, delayed allocation on ordered mode is much simpler. Signed-off-by: Jan Kara --- fs/jbd2/commit.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 92b6ac3df8ab..3ca107b5c86b 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -355,6 +355,81 @@ write_out_data: journal_do_submit_data(wbuf, bufs); } +/* + * Submit all the data buffers of inode associated with the transaction to + * disk. + * + * We are in a committing transaction. Therefore no new inode can be added to + * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently + * operate on from being released while we write out pages. + */ +static int journal_submit_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode; + int err, ret = 0; + struct address_space *mapping; + + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + mapping = jinode->i_vfs_inode->i_mapping; + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + err = filemap_fdatawrite_range(mapping, 0, + i_size_read(jinode->i_vfs_inode)); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + J_ASSERT(jinode->i_transaction == commit_transaction); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + spin_unlock(&journal->j_list_lock); + return ret; +} + +/* + * Wait for data submitted for writeout, refile inodes to proper + * transaction if needed. + * + */ +static int journal_finish_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode, *next_i; + int err, ret = 0; + + /* For locking, see the comment in journal_submit_inode_data_buffers() */ + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + + /* Now refile inode to proper lists */ + list_for_each_entry_safe(jinode, next_i, + &commit_transaction->t_inode_list, i_list) { + list_del(&jinode->i_list); + if (jinode->i_next_transaction) { + jinode->i_transaction = jinode->i_next_transaction; + jinode->i_next_transaction = NULL; + list_add(&jinode->i_list, + &jinode->i_transaction->t_inode_list); + } else { + jinode->i_transaction = NULL; + } + } + spin_unlock(&journal->j_list_lock); + + return ret; +} + static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) { struct page *page = bh->b_page; @@ -529,6 +604,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ err = 0; journal_submit_data_buffers(journal, commit_transaction); + err = journal_submit_inode_data_buffers(journal, commit_transaction); + if (err) + jbd2_journal_abort(journal, err); /* * Wait for all previously submitted IO to complete if commit @@ -760,6 +838,17 @@ start_journal_io: __jbd2_journal_abort_hard(journal); } + /* + * This is the right place to wait for data buffers both for ASYNC + * and !ASYNC commit. If commit is ASYNC, we need to wait only after + * the commit block went to disk (which happens above). If commit is + * SYNC, we need to wait for data buffers before we start writing + * commit block, which happens below in such setting. + */ + err = journal_finish_inode_data_buffers(journal, commit_transaction); + if (err) + jbd2_journal_abort(journal, err); + /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the @@ -880,6 +969,7 @@ wait_for_iobuf: jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_sync_datalist == NULL); + J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); J_ASSERT(commit_transaction->t_iobuf_list == NULL); -- cgit v1.2.2 From 87c89c232c8f7b3820c33c3b9bc803e9358027da Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: jbd2: Remove data=ordered mode support using jbd buffer heads Signed-off-by: Jan Kara --- fs/jbd2/commit.c | 221 ++----------------------------------------------------- 1 file changed, 8 insertions(+), 213 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3ca107b5c86b..483183d15ed5 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -37,8 +37,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) } /* - * When an ext3-ordered file is truncated, it is possible that many pages are - * not sucessfully freed, because they are attached to a committing transaction. + * When an ext4 file is truncated, it is possible that some pages are not + * successfully freed, because they are attached to a committing transaction. * After the transaction commits, these pages are left on the LRU, with no * ->mapping, and with attached buffers. These pages are trivially reclaimable * by the VM, but their apparent absence upsets the VM accounting, and it makes @@ -79,21 +79,6 @@ nope: __brelse(bh); } -/* - * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is - * held. For ranking reasons we must trylock. If we lose, schedule away and - * return 0. j_list_lock is dropped in this case. - */ -static int inverted_lock(journal_t *journal, struct buffer_head *bh) -{ - if (!jbd_trylock_bh_state(bh)) { - spin_unlock(&journal->j_list_lock); - schedule(); - return 0; - } - return 1; -} - /* * Done it all: now submit the commit record. We should have * cleaned up our previous buffers by now, so if we are in abort @@ -199,162 +184,6 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) return ret; } -/* - * Wait for all submitted IO to complete. - */ -static int journal_wait_on_locked_list(journal_t *journal, - transaction_t *commit_transaction) -{ - int ret = 0; - struct journal_head *jh; - - while (commit_transaction->t_locked_list) { - struct buffer_head *bh; - - jh = commit_transaction->t_locked_list->b_tprev; - bh = jh2bh(jh); - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - ret = -EIO; - spin_lock(&journal->j_list_lock); - } - if (!inverted_lock(journal, bh)) { - put_bh(bh); - spin_lock(&journal->j_list_lock); - continue; - } - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } - put_bh(bh); - cond_resched_lock(&journal->j_list_lock); - } - return ret; - } - -static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) -{ - int i; - - for (i = 0; i < bufs; i++) { - wbuf[i]->b_end_io = end_buffer_write_sync; - /* We use-up our safety reference in submit_bh() */ - submit_bh(WRITE, wbuf[i]); - } -} - -/* - * Submit all the data buffers to disk - */ -static void journal_submit_data_buffers(journal_t *journal, - transaction_t *commit_transaction) -{ - struct journal_head *jh; - struct buffer_head *bh; - int locked; - int bufs = 0; - struct buffer_head **wbuf = journal->j_wbuf; - - /* - * Whenever we unlock the journal and sleep, things can get added - * onto ->t_sync_datalist, so we have to keep looping back to - * write_out_data until we *know* that the list is empty. - * - * Cleanup any flushed data buffers from the data list. Even in - * abort mode, we want to flush this out as soon as possible. - */ -write_out_data: - cond_resched(); - spin_lock(&journal->j_list_lock); - - while (commit_transaction->t_sync_datalist) { - jh = commit_transaction->t_sync_datalist; - bh = jh2bh(jh); - locked = 0; - - /* Get reference just to make sure buffer does not disappear - * when we are forced to drop various locks */ - get_bh(bh); - /* If the buffer is dirty, we need to submit IO and hence - * we need the buffer lock. We try to lock the buffer without - * blocking. If we fail, we need to drop j_list_lock and do - * blocking lock_buffer(). - */ - if (buffer_dirty(bh)) { - if (test_set_buffer_locked(bh)) { - BUFFER_TRACE(bh, "needs blocking lock"); - spin_unlock(&journal->j_list_lock); - /* Write out all data to prevent deadlocks */ - journal_do_submit_data(wbuf, bufs); - bufs = 0; - lock_buffer(bh); - spin_lock(&journal->j_list_lock); - } - locked = 1; - } - /* We have to get bh_state lock. Again out of order, sigh. */ - if (!inverted_lock(journal, bh)) { - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - } - /* Someone already cleaned up the buffer? */ - if (!buffer_jbd(bh) - || jh->b_transaction != commit_transaction - || jh->b_jlist != BJ_SyncData) { - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - BUFFER_TRACE(bh, "already cleaned up"); - put_bh(bh); - continue; - } - if (locked && test_clear_buffer_dirty(bh)) { - BUFFER_TRACE(bh, "needs writeout, adding to array"); - wbuf[bufs++] = bh; - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - if (bufs == journal->j_wbufsize) { - spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); - bufs = 0; - goto write_out_data; - } - } else if (!locked && buffer_locked(bh)) { - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - put_bh(bh); - } else { - BUFFER_TRACE(bh, "writeout complete: unfile"); - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - jbd2_journal_remove_journal_head(bh); - /* Once for our safety reference, once for - * jbd2_journal_remove_journal_head() */ - put_bh(bh); - put_bh(bh); - } - - if (need_resched() || spin_needbreak(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; - } - } - spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); -} - /* * Submit all the data buffers of inode associated with the transaction to * disk. @@ -602,24 +431,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = 0; - journal_submit_data_buffers(journal, commit_transaction); err = journal_submit_inode_data_buffers(journal, commit_transaction); - if (err) - jbd2_journal_abort(journal, err); - - /* - * Wait for all previously submitted IO to complete if commit - * record is to be written synchronously. - */ - spin_lock(&journal->j_list_lock); - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) - err = journal_wait_on_locked_list(journal, - commit_transaction); - - spin_unlock(&journal->j_list_lock); - if (err) jbd2_journal_abort(journal, err); @@ -627,16 +439,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(3, "JBD: commit phase 2\n"); - /* - * If we found any dirty or locked buffers, then we should have - * looped back up to the write_out_data label. If there weren't - * any then journal_clean_data_list should have wiped the list - * clean by now, so check that it is in fact empty. - */ - J_ASSERT (commit_transaction->t_sync_datalist == NULL); - - jbd_debug (3, "JBD: commit phase 3\n"); - /* * Way to go: we have now written out all of the data for a * transaction! Now comes the tricky part: we need to write out @@ -655,6 +457,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); + err = 0; descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { @@ -829,13 +632,6 @@ start_journal_io: &cbh, crc32_sum); if (err) __jbd2_journal_abort_hard(journal); - - spin_lock(&journal->j_list_lock); - err = journal_wait_on_locked_list(journal, - commit_transaction); - spin_unlock(&journal->j_list_lock); - if (err) - __jbd2_journal_abort_hard(journal); } /* @@ -860,7 +656,7 @@ start_journal_io: so we incur less scheduling load. */ - jbd_debug(3, "JBD: commit phase 4\n"); + jbd_debug(3, "JBD: commit phase 3\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. @@ -919,7 +715,7 @@ wait_for_iobuf: J_ASSERT (commit_transaction->t_shadow_list == NULL); - jbd_debug(3, "JBD: commit phase 5\n"); + jbd_debug(3, "JBD: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: @@ -946,7 +742,7 @@ wait_for_iobuf: /* AKPM: bforget here */ } - jbd_debug(3, "JBD: commit phase 6\n"); + jbd_debug(3, "JBD: commit phase 5\n"); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { @@ -966,9 +762,8 @@ wait_for_iobuf: transaction can be removed from any checkpoint list it was on before. */ - jbd_debug(3, "JBD: commit phase 7\n"); + jbd_debug(3, "JBD: commit phase 6\n"); - J_ASSERT(commit_transaction->t_sync_datalist == NULL); J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); @@ -1090,7 +885,7 @@ restart_loop: /* Done with this transaction! */ - jbd_debug(3, "JBD: commit phase 8\n"); + jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT); -- cgit v1.2.2 From cd1aac32923a9c8adcc0ae85e33c1ca0c5855838 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: ext4: Add ordered mode support for delalloc This provides a new ordered mode implementation which gets rid of using buffer heads to enforce the ordering between metadata change with the related data chage. Instead, in the new ordering mode, it keeps track of all of the inodes touched by each transaction on a list, and when that transaction is committed, it flushes all of the dirty pages for those inodes. In addition, the new ordered mode reverses the lock ordering of the page lock and transaction lock, which provides easier support for delayed allocation. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) (limited to 'fs/jbd2/commit.c') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 483183d15ed5..f8b3be873226 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -184,6 +186,27 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) return ret; } +/* + * write the filemap data using writepage() address_space_operations. + * We don't do block allocation here even for delalloc. We don't + * use writepages() because with dealyed allocation we may be doing + * block allocation in writepages(). + */ +static int journal_submit_inode_data_buffers(struct address_space *mapping) +{ + int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = mapping->nrpages * 2, + .range_start = 0, + .range_end = i_size_read(mapping->host), + .for_writepages = 1, + }; + + ret = generic_writepages(mapping, &wbc); + return ret; +} + /* * Submit all the data buffers of inode associated with the transaction to * disk. @@ -192,7 +215,7 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently * operate on from being released while we write out pages. */ -static int journal_submit_inode_data_buffers(journal_t *journal, +static int journal_submit_data_buffers(journal_t *journal, transaction_t *commit_transaction) { struct jbd2_inode *jinode; @@ -204,8 +227,13 @@ static int journal_submit_inode_data_buffers(journal_t *journal, mapping = jinode->i_vfs_inode->i_mapping; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); - err = filemap_fdatawrite_range(mapping, 0, - i_size_read(jinode->i_vfs_inode)); + /* + * submit the inode data buffers. We use writepage + * instead of writepages. Because writepages can do + * block allocation with delalloc. We need to write + * only allocated blocks here. + */ + err = journal_submit_inode_data_buffers(mapping); if (!ret) ret = err; spin_lock(&journal->j_list_lock); @@ -228,7 +256,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, struct jbd2_inode *jinode, *next_i; int err, ret = 0; - /* For locking, see the comment in journal_submit_inode_data_buffers() */ + /* For locking, see the comment in journal_submit_data_buffers() */ spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { jinode->i_flags |= JI_COMMIT_RUNNING; @@ -431,7 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = journal_submit_inode_data_buffers(journal, commit_transaction); + err = journal_submit_data_buffers(journal, commit_transaction); if (err) jbd2_journal_abort(journal, err); -- cgit v1.2.2