author		Ingo Molnar <mingo@elte.hu>	2008-10-28 11:26:12 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-10-28 11:26:12 -0400
commit		7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch)
tree		e730a4565e0318140d2fbd2f0415d18a339d7336 /fs/jbd
parent		41b9eb264c8407655db57b60b4457fe1b2ec9977 (diff)
parent		0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff)
Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks
Diffstat (limited to 'fs/jbd')
-rw-r--r--	fs/jbd/Kconfig		30
-rw-r--r--	fs/jbd/checkpoint.c	68
-rw-r--r--	fs/jbd/commit.c		78
-rw-r--r--	fs/jbd/journal.c	36
-rw-r--r--	fs/jbd/recovery.c	7
-rw-r--r--	fs/jbd/revoke.c		163
-rw-r--r--	fs/jbd/transaction.c	77
7 files changed, 318 insertions(+), 141 deletions(-)
diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig
new file mode 100644
index 000000000000..4e28beeed157
--- /dev/null
+++ b/fs/jbd/Kconfig
@@ -0,0 +1,30 @@
+config JBD
+	tristate
+	help
+	  This is a generic journalling layer for block devices.  It is
+	  currently used by the ext3 file system, but it could also be
+	  used to add journal support to other file systems or block
+	  devices such as RAID or LVM.
+
+	  If you are using the ext3 file system, you need to say Y here.
+	  If you are not using ext3 then you will probably want to say N.
+
+	  To compile this device as a module, choose M here: the module will be
+	  called jbd.  If you are compiling ext3 into the kernel, you
+	  cannot compile this code as a module.
+
+config JBD_DEBUG
+	bool "JBD (ext3) debugging support"
+	depends on JBD && DEBUG_FS
+	help
+	  If you are using the ext3 journaled file system (or potentially any
+	  other file system/device using JBD), this option allows you to
+	  enable debugging output while the system is running, in order to
+	  help track down any problems you are having.  By default the
+	  debugging output will be turned off.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
+	  number between 1 and 5, the higher the number, the more debugging
+	  output is generated.  To turn debugging off again, do
+	  "echo 0 > /sys/kernel/debug/jbd/jbd-debug".
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index a5432bbbfb88..1bd8d4acc6f2 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -93,7 +93,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 	int ret = 0;
 	struct buffer_head *bh = jh2bh(jh);
 
-	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
+	if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
+	    !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
 		ret = __journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
@@ -126,14 +127,29 @@ void __log_wait_for_space(journal_t *journal)
 
 	/*
 	 * Test again, another process may have checkpointed while we
-	 * were waiting for the checkpoint lock
+	 * were waiting for the checkpoint lock. If there are no
+	 * outstanding transactions there is nothing to checkpoint and
+	 * we can't make progress. Abort the journal in this case.
 	 */
 	spin_lock(&journal->j_state_lock);
+	spin_lock(&journal->j_list_lock);
 	nblocks = jbd_space_needed(journal);
 	if (__log_space_left(journal) < nblocks) {
+		int chkpt = journal->j_checkpoint_transactions != NULL;
+
+		spin_unlock(&journal->j_list_lock);
 		spin_unlock(&journal->j_state_lock);
-		log_do_checkpoint(journal);
+		if (chkpt) {
+			log_do_checkpoint(journal);
+		} else {
+			printk(KERN_ERR "%s: no transactions\n",
+			       __func__);
+			journal_abort(journal, 0);
+		}
+
 		spin_lock(&journal->j_state_lock);
+	} else {
+		spin_unlock(&journal->j_list_lock);
 	}
 	mutex_unlock(&journal->j_checkpoint_mutex);
 }
@@ -160,21 +176,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
  * buffers. Note that we take the buffers in the opposite ordering
  * from the one in which they were submitted for IO.
  *
+ * Return 0 on success, and return <0 if some buffers have failed
+ * to be written out.
+ *
  * Called with j_list_lock held.
  */
-static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
+static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
 	struct journal_head *jh;
 	struct buffer_head *bh;
 	tid_t this_tid;
 	int released = 0;
+	int ret = 0;
 
 	this_tid = transaction->t_tid;
 restart:
 	/* Did somebody clean up the transaction in the meanwhile? */
 	if (journal->j_checkpoint_transactions != transaction ||
 	    transaction->t_tid != this_tid)
-		return;
+		return ret;
 	while (!released && transaction->t_checkpoint_io_list) {
 		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
@@ -194,6 +214,9 @@ restart:
 			spin_lock(&journal->j_list_lock);
 			goto restart;
 		}
+		if (unlikely(buffer_write_io_error(bh)))
+			ret = -EIO;
+
 		/*
 		 * Now in whatever state the buffer currently is, we know that
 		 * it has been written out and so we can drop it from the list
@@ -203,6 +226,8 @@ restart:
 		journal_remove_journal_head(bh);
 		__brelse(bh);
 	}
+
+	return ret;
 }
 
 #define NR_BATCH	64
@@ -226,7 +251,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Try to flush one buffer from the checkpoint list to disk.
 *
 * Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list.
+ * scan of the checkpoint list.  Return <0 if the buffer has failed to
+ * be written out.
 *
 * Called with j_list_lock held and drops it if 1 is returned
 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -256,6 +282,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		log_wait_commit(journal, tid);
 		ret = 1;
 	} else if (!buffer_dirty(bh)) {
+		ret = 1;
+		if (unlikely(buffer_write_io_error(bh)))
+			ret = -EIO;
 		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
 		BUFFER_TRACE(bh, "remove from checkpoint");
 		__journal_remove_checkpoint(jh);
@@ -263,7 +292,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		jbd_unlock_bh_state(bh);
 		journal_remove_journal_head(bh);
 		__brelse(bh);
-		ret = 1;
 	} else {
 		/*
 		 * Important: we are about to write the buffer, and
@@ -295,6 +323,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 * to disk. We submit larger chunks of data at once.
 *
 * The journal should be locked before calling this function.
+ * Called with j_checkpoint_mutex held.
 */
 int log_do_checkpoint(journal_t *journal)
 {
@@ -318,6 +347,7 @@ int log_do_checkpoint(journal_t *journal)
 	 * OK, we need to start writing disk blocks. Take one transaction
 	 * and write it.
 	 */
+	result = 0;
 	spin_lock(&journal->j_list_lock);
 	if (!journal->j_checkpoint_transactions)
 		goto out;
@@ -334,7 +364,7 @@ restart:
 		int batch_count = 0;
 		struct buffer_head *bhs[NR_BATCH];
 		struct journal_head *jh;
-		int retry = 0;
+		int retry = 0, err;
 
 		while (!retry && transaction->t_checkpoint_list) {
 			struct buffer_head *bh;
@@ -347,6 +377,8 @@ restart:
 				break;
 			}
 			retry = __process_buffer(journal, jh, bhs,&batch_count);
+			if (retry < 0 && !result)
+				result = retry;
 			if (!retry && (need_resched() ||
 				spin_needbreak(&journal->j_list_lock))) {
 				spin_unlock(&journal->j_list_lock);
@@ -371,14 +403,18 @@ restart:
 		 * Now we have cleaned up the first transaction's checkpoint
 		 * list. Let's clean up the second one
 		 */
-		__wait_cp_io(journal, transaction);
+		err = __wait_cp_io(journal, transaction);
+		if (!result)
+			result = err;
 	}
 out:
 	spin_unlock(&journal->j_list_lock);
-	result = cleanup_journal_tail(journal);
 	if (result < 0)
-		return result;
-	return 0;
+		journal_abort(journal, result);
+	else
+		result = cleanup_journal_tail(journal);
+
+	return (result < 0) ? result : 0;
 }
 
 /*
@@ -394,8 +430,9 @@ out:
 * This is the only part of the journaling code which really needs to be
 * aware of transaction aborts.  Checkpointing involves writing to the
 * main filesystem area rather than to the journal, so it can proceed
- * even in abort state, but we must not update the journal superblock if
- * we have an abort error outstanding.
+ * even in abort state, but we must not update the super block if
+ * checkpointing may have failed.  Otherwise, we would lose some metadata
+ * buffers which should be written-back to the filesystem.
 */
 
 int cleanup_journal_tail(journal_t *journal)
@@ -404,6 +441,9 @@ int cleanup_journal_tail(journal_t *journal)
 	tid_t first_tid;
 	unsigned long blocknr, freed;
 
+	if (is_journal_aborted(journal))
+		return 1;
+
 	/* OK, work out the oldest transaction remaining in the log, and
 	 * the log block it starts at.
 	 *
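
Taken together, the checkpoint.c hunks above establish a new contract: log_do_checkpoint() must be entered with j_checkpoint_mutex held, it returns the first write error (<0) it encountered, and it aborts the journal itself on error. A minimal sketch of a conforming caller (hypothetical; the journal_flush() hunk in journal.c below is the real in-tree example):

	int err;

	mutex_lock(&journal->j_checkpoint_mutex);
	err = log_do_checkpoint(journal);	/* <0: a checkpoint buffer failed to write */
	mutex_unlock(&journal->j_checkpoint_mutex);
	if (err)
		return err;	/* the journal has already been aborted internally */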
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 5a8ca61498ca..25719d902c51 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -36,7 +36,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 
 /*
 * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * not successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -45,8 +45,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
- * Called under lock_journal(), and possibly under journal_datalist_lock.  The
- * caller provided us with a ref against the buffer, and we drop that here.
+ * Called under journal->j_list_lock.  The caller provided us with a ref
+ * against the buffer, and we drop that here.
 */
 static void release_buffer_page(struct buffer_head *bh)
 {
@@ -63,7 +63,7 @@ static void release_buffer_page(struct buffer_head *bh)
 		goto nope;
 
 	/* OK, it's a truncated page */
-	if (TestSetPageLocked(page))
+	if (!trylock_page(page))
 		goto nope;
 
 	page_cache_get(page);
@@ -78,6 +78,19 @@ nope:
 }
 
 /*
+ * Decrement reference counter for data buffer. If it has been marked
+ * 'BH_Freed', release it and the page to which it belongs if possible.
+ */
+static void release_data_buffer(struct buffer_head *bh)
+{
+	if (buffer_freed(bh)) {
+		clear_buffer_freed(bh);
+		release_buffer_page(bh);
+	} else
+		put_bh(bh);
+}
+
+/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
@@ -172,7 +185,7 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
 /*
 * Submit all the data buffers to disk
 */
-static void journal_submit_data_buffers(journal_t *journal,
+static int journal_submit_data_buffers(journal_t *journal,
 				transaction_t *commit_transaction)
 {
 	struct journal_head *jh;
@@ -180,6 +193,7 @@ static void journal_submit_data_buffers(journal_t *journal,
 	int locked;
 	int bufs = 0;
 	struct buffer_head **wbuf = journal->j_wbuf;
+	int err = 0;
 
 	/*
 	 * Whenever we unlock the journal and sleep, things can get added
@@ -207,7 +221,7 @@ write_out_data:
 		 * blocking lock_buffer().
 		 */
 		if (buffer_dirty(bh)) {
-			if (test_set_buffer_locked(bh)) {
+			if (!trylock_buffer(bh)) {
 				BUFFER_TRACE(bh, "needs blocking lock");
 				spin_unlock(&journal->j_list_lock);
 				/* Write out all data to prevent deadlocks */
@@ -231,7 +245,7 @@ write_out_data:
 			if (locked)
 				unlock_buffer(bh);
 			BUFFER_TRACE(bh, "already cleaned up");
-			put_bh(bh);
+			release_data_buffer(bh);
 			continue;
 		}
 		if (locked && test_clear_buffer_dirty(bh)) {
@@ -253,15 +267,17 @@ write_out_data:
 			put_bh(bh);
 		} else {
 			BUFFER_TRACE(bh, "writeout complete: unfile");
+			if (unlikely(!buffer_uptodate(bh)))
+				err = -EIO;
 			__journal_unfile_buffer(jh);
 			jbd_unlock_bh_state(bh);
 			if (locked)
 				unlock_buffer(bh);
 			journal_remove_journal_head(bh);
-			/* Once for our safety reference, once for
+			/* One for our safety reference, other for
 			 * journal_remove_journal_head() */
 			put_bh(bh);
-			put_bh(bh);
+			release_data_buffer(bh);
 		}
 
 		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
@@ -271,6 +287,8 @@ write_out_data:
 	}
 	spin_unlock(&journal->j_list_lock);
 	journal_do_submit_data(wbuf, bufs);
+
+	return err;
 }
 
 /*
@@ -410,8 +428,7 @@ void journal_commit_transaction(journal_t *journal)
 	 * Now start flushing things to disk, in the order they appear
 	 * on the transaction lists.  Data blocks go first.
 	 */
-	err = 0;
-	journal_submit_data_buffers(journal, commit_transaction);
+	err = journal_submit_data_buffers(journal, commit_transaction);
 
 	/*
 	 * Wait for all previously submitted IO to complete.
@@ -426,10 +443,21 @@ void journal_commit_transaction(journal_t *journal)
 		if (buffer_locked(bh)) {
 			spin_unlock(&journal->j_list_lock);
 			wait_on_buffer(bh);
-			if (unlikely(!buffer_uptodate(bh)))
-				err = -EIO;
 			spin_lock(&journal->j_list_lock);
 		}
+		if (unlikely(!buffer_uptodate(bh))) {
+			if (!trylock_page(bh->b_page)) {
+				spin_unlock(&journal->j_list_lock);
+				lock_page(bh->b_page);
+				spin_lock(&journal->j_list_lock);
+			}
+			if (bh->b_page->mapping)
+				set_bit(AS_EIO, &bh->b_page->mapping->flags);
+
+			unlock_page(bh->b_page);
+			SetPageError(bh->b_page);
+			err = -EIO;
+		}
 		if (!inverted_lock(journal, bh)) {
 			put_bh(bh);
 			spin_lock(&journal->j_list_lock);
@@ -443,17 +471,23 @@ void journal_commit_transaction(journal_t *journal)
 		} else {
 			jbd_unlock_bh_state(bh);
 		}
-		put_bh(bh);
+		release_data_buffer(bh);
 		cond_resched_lock(&journal->j_list_lock);
 	}
 	spin_unlock(&journal->j_list_lock);
 
-	if (err)
-		journal_abort(journal, err);
+	if (err) {
+		char b[BDEVNAME_SIZE];
 
-	journal_write_revoke_records(journal, commit_transaction);
+		printk(KERN_WARNING
+			"JBD: Detected IO errors while flushing file data "
+			"on %s\n", bdevname(journal->j_fs_dev, b));
+		if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
+			journal_abort(journal, err);
+		err = 0;
+	}
 
-	jbd_debug(3, "JBD: commit phase 2\n");
+	journal_write_revoke_records(journal, commit_transaction);
 
 	/*
 	 * If we found any dirty or locked buffers, then we should have
@@ -486,9 +520,10 @@ void journal_commit_transaction(journal_t *journal)
 		jh = commit_transaction->t_buffers;
 
 		/* If we're in abort mode, we just un-journal the buffer and
-		   release it for background writing. */
+		   release it. */
 
 		if (is_journal_aborted(journal)) {
+			clear_buffer_jbddirty(jh2bh(jh));
 			JBUFFER_TRACE(jh, "journal is aborting: refile");
 			journal_refile_buffer(journal, jh);
 			/* If that was the last one, we need to clean up
@@ -730,6 +765,9 @@ wait_for_iobuf:
 		/* AKPM: bforget here */
 	}
 
+	if (err)
+		journal_abort(journal, err);
+
 	jbd_debug(3, "JBD: commit phase 6\n");
 
 	if (journal_write_commit_record(journal, commit_transaction))
@@ -820,6 +858,8 @@ restart_loop:
 		if (buffer_jbddirty(bh)) {
 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
 			__journal_insert_checkpoint(jh, commit_transaction);
+			if (is_journal_aborted(journal))
+				clear_buffer_jbddirty(bh);
 			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 			__journal_refile_buffer(jh);
 			jbd_unlock_bh_state(bh);
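
Note the behavioural change in the commit path above: an ordered-mode data write error now only aborts the journal when JFS_ABORT_ON_SYNCDATA_ERR is set; otherwise it is reported with a printk and the commit proceeds. A hedged sketch of how a filesystem would opt in at mount time (the test_opt()/DATA_ERR_ABORT names mirror ext3's data_err=abort mount option and are assumptions here, not part of this diff):

	/* assumption: illustrative mount-time opt-in, not from this patch */
	if (test_opt(sb, DATA_ERR_ABORT))
		journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
	else
		journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;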
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index b99c3b3654c4..9e4fa52d7dc8 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -68,7 +68,6 @@ EXPORT_SYMBOL(journal_set_features);
 EXPORT_SYMBOL(journal_create);
 EXPORT_SYMBOL(journal_load);
 EXPORT_SYMBOL(journal_destroy);
-EXPORT_SYMBOL(journal_update_superblock);
 EXPORT_SYMBOL(journal_abort);
 EXPORT_SYMBOL(journal_errno);
 EXPORT_SYMBOL(journal_ack_err);
@@ -1122,9 +1121,12 @@ recovery_error:
 *
 * Release a journal_t structure once it is no longer in use by the
 * journaled object.
+ * Return <0 if we couldn't clean up the journal.
 */
-void journal_destroy(journal_t *journal)
+int journal_destroy(journal_t *journal)
 {
+	int err = 0;
+
 	/* Wait for the commit thread to wake up and die. */
 	journal_kill_thread(journal);
 
@@ -1147,11 +1149,16 @@ void journal_destroy(journal_t *journal)
 	J_ASSERT(journal->j_checkpoint_transactions == NULL);
 	spin_unlock(&journal->j_list_lock);
 
-	/* We can now mark the journal as empty. */
-	journal->j_tail = 0;
-	journal->j_tail_sequence = ++journal->j_transaction_sequence;
 	if (journal->j_sb_buffer) {
-		journal_update_superblock(journal, 1);
+		if (!is_journal_aborted(journal)) {
+			/* We can now mark the journal as empty. */
+			journal->j_tail = 0;
+			journal->j_tail_sequence =
+				++journal->j_transaction_sequence;
+			journal_update_superblock(journal, 1);
+		} else {
+			err = -EIO;
+		}
 		brelse(journal->j_sb_buffer);
 	}
 
@@ -1161,6 +1168,8 @@ void journal_destroy(journal_t *journal)
 	journal_destroy_revoke(journal);
 	kfree(journal->j_wbuf);
 	kfree(journal);
+
+	return err;
 }
 
 
@@ -1360,10 +1369,16 @@ int journal_flush(journal_t *journal)
 	spin_lock(&journal->j_list_lock);
 	while (!err && journal->j_checkpoint_transactions != NULL) {
 		spin_unlock(&journal->j_list_lock);
+		mutex_lock(&journal->j_checkpoint_mutex);
 		err = log_do_checkpoint(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
 		spin_lock(&journal->j_list_lock);
 	}
 	spin_unlock(&journal->j_list_lock);
+
+	if (is_journal_aborted(journal))
+		return -EIO;
+
 	cleanup_journal_tail(journal);
 
 	/* Finally, mark the journal as really needing no recovery.
@@ -1385,7 +1400,7 @@ int journal_flush(journal_t *journal)
 	J_ASSERT(journal->j_head == journal->j_tail);
 	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
 	spin_unlock(&journal->j_state_lock);
-	return err;
+	return 0;
 }
 
 /**
@@ -1636,9 +1651,10 @@ static int journal_init_journal_head_cache(void)
 
 static void journal_destroy_journal_head_cache(void)
 {
-	J_ASSERT(journal_head_cache != NULL);
-	kmem_cache_destroy(journal_head_cache);
-	journal_head_cache = NULL;
+	if (journal_head_cache) {
+		kmem_cache_destroy(journal_head_cache);
+		journal_head_cache = NULL;
+	}
 }
 
 /*
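
Since journal_destroy() now returns an int (and no longer rewrites the superblock when the journal is aborted), unmount paths can detect that the journal was not cleanly emptied. A simplified sketch of a caller (modelled on what an ext3_put_super()-style path would do; the ext3_abort() call here is an assumption, not part of this diff):

	/* assumption: simplified unmount-path caller, not from this patch */
	err = journal_destroy(sbi->s_journal);
	sbi->s_journal = NULL;
	if (err < 0)
		ext3_abort(sb, __func__, "Couldn't clean up the journal");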
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 43bc5e5ed064..db5e982c5ddf 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -223,7 +223,7 @@ do { \
 */
 int journal_recover(journal_t *journal)
 {
-	int		err;
+	int		err, err2;
 	journal_superblock_t *	sb;
 
 	struct recovery_info	info;
@@ -261,7 +261,10 @@ int journal_recover(journal_t *journal)
 	journal->j_transaction_sequence = ++info.end_transaction;
 
 	journal_clear_revoke(journal);
-	sync_blockdev(journal->j_fs_dev);
+	err2 = sync_blockdev(journal->j_fs_dev);
+	if (!err)
+		err = err2;
+
 	return err;
 }
 
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 1bb43e987f4b..c7bd649bbbdc 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -166,138 +166,123 @@ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
 	return NULL;
 }
 
+void journal_destroy_revoke_caches(void)
+{
+	if (revoke_record_cache) {
+		kmem_cache_destroy(revoke_record_cache);
+		revoke_record_cache = NULL;
+	}
+	if (revoke_table_cache) {
+		kmem_cache_destroy(revoke_table_cache);
+		revoke_table_cache = NULL;
+	}
+}
+
 int __init journal_init_revoke_caches(void)
 {
+	J_ASSERT(!revoke_record_cache);
+	J_ASSERT(!revoke_table_cache);
+
 	revoke_record_cache = kmem_cache_create("revoke_record",
 					   sizeof(struct jbd_revoke_record_s),
 					   0,
 					   SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
 					   NULL);
 	if (!revoke_record_cache)
-		return -ENOMEM;
+		goto record_cache_failure;
 
 	revoke_table_cache = kmem_cache_create("revoke_table",
 					   sizeof(struct jbd_revoke_table_s),
 					   0, SLAB_TEMPORARY, NULL);
-	if (!revoke_table_cache) {
-		kmem_cache_destroy(revoke_record_cache);
-		revoke_record_cache = NULL;
-		return -ENOMEM;
-	}
+	if (!revoke_table_cache)
+		goto table_cache_failure;
+
 	return 0;
-}
 
-void journal_destroy_revoke_caches(void)
-{
-	kmem_cache_destroy(revoke_record_cache);
-	revoke_record_cache = NULL;
-	kmem_cache_destroy(revoke_table_cache);
-	revoke_table_cache = NULL;
+table_cache_failure:
+	journal_destroy_revoke_caches();
+record_cache_failure:
+	return -ENOMEM;
 }
 
-/* Initialise the revoke table for a given journal to a given size. */
-
-int journal_init_revoke(journal_t *journal, int hash_size)
+static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size)
 {
-	int shift, tmp;
+	int shift = 0;
+	int tmp = hash_size;
+	struct jbd_revoke_table_s *table;
 
-	J_ASSERT (journal->j_revoke_table[0] == NULL);
+	table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
+	if (!table)
+		goto out;
 
-	shift = 0;
-	tmp = hash_size;
 	while((tmp >>= 1UL) != 0UL)
 		shift++;
 
-	journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
-	if (!journal->j_revoke_table[0])
-		return -ENOMEM;
-	journal->j_revoke = journal->j_revoke_table[0];
-
-	/* Check that the hash_size is a power of two */
-	J_ASSERT(is_power_of_2(hash_size));
-
-	journal->j_revoke->hash_size = hash_size;
-
-	journal->j_revoke->hash_shift = shift;
-
-	journal->j_revoke->hash_table =
+	table->hash_size = hash_size;
+	table->hash_shift = shift;
+	table->hash_table =
 		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
-	if (!journal->j_revoke->hash_table) {
-		kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
-		journal->j_revoke = NULL;
-		return -ENOMEM;
+	if (!table->hash_table) {
+		kmem_cache_free(revoke_table_cache, table);
+		table = NULL;
+		goto out;
 	}
 
 	for (tmp = 0; tmp < hash_size; tmp++)
-		INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+		INIT_LIST_HEAD(&table->hash_table[tmp]);
 
-	journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
-	if (!journal->j_revoke_table[1]) {
-		kfree(journal->j_revoke_table[0]->hash_table);
-		kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
-		return -ENOMEM;
+out:
+	return table;
+}
+
+static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table)
+{
+	int i;
+	struct list_head *hash_list;
+
+	for (i = 0; i < table->hash_size; i++) {
+		hash_list = &table->hash_table[i];
+		J_ASSERT(list_empty(hash_list));
 	}
 
-	journal->j_revoke = journal->j_revoke_table[1];
+	kfree(table->hash_table);
+	kmem_cache_free(revoke_table_cache, table);
+}
 
-	/* Check that the hash_size is a power of two */
+/* Initialise the revoke table for a given journal to a given size. */
+int journal_init_revoke(journal_t *journal, int hash_size)
+{
+	J_ASSERT(journal->j_revoke_table[0] == NULL);
 	J_ASSERT(is_power_of_2(hash_size));
 
-	journal->j_revoke->hash_size = hash_size;
+	journal->j_revoke_table[0] = journal_init_revoke_table(hash_size);
+	if (!journal->j_revoke_table[0])
+		goto fail0;
 
-	journal->j_revoke->hash_shift = shift;
+	journal->j_revoke_table[1] = journal_init_revoke_table(hash_size);
+	if (!journal->j_revoke_table[1])
+		goto fail1;
 
-	journal->j_revoke->hash_table =
-		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
-	if (!journal->j_revoke->hash_table) {
-		kfree(journal->j_revoke_table[0]->hash_table);
-		kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
-		kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]);
-		journal->j_revoke = NULL;
-		return -ENOMEM;
-	}
-
-	for (tmp = 0; tmp < hash_size; tmp++)
-		INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+	journal->j_revoke = journal->j_revoke_table[1];
 
 	spin_lock_init(&journal->j_revoke_lock);
 
 	return 0;
-}
 
-/* Destoy a journal's revoke table. The table must already be empty! */
+fail1:
+	journal_destroy_revoke_table(journal->j_revoke_table[0]);
+fail0:
+	return -ENOMEM;
+}
 
+/* Destroy a journal's revoke table. The table must already be empty! */
 void journal_destroy_revoke(journal_t *journal)
 {
-	struct jbd_revoke_table_s *table;
-	struct list_head *hash_list;
-	int i;
-
-	table = journal->j_revoke_table[0];
-	if (!table)
-		return;
-
-	for (i=0; i<table->hash_size; i++) {
-		hash_list = &table->hash_table[i];
-		J_ASSERT (list_empty(hash_list));
-	}
-
-	kfree(table->hash_table);
-	kmem_cache_free(revoke_table_cache, table);
-	journal->j_revoke = NULL;
-
-	table = journal->j_revoke_table[1];
-	if (!table)
-		return;
-
-	for (i=0; i<table->hash_size; i++) {
-		hash_list = &table->hash_table[i];
-		J_ASSERT (list_empty(hash_list));
-	}
-
-	kfree(table->hash_table);
-	kmem_cache_free(revoke_table_cache, table);
 	journal->j_revoke = NULL;
+	if (journal->j_revoke_table[0])
+		journal_destroy_revoke_table(journal->j_revoke_table[0]);
+	if (journal->j_revoke_table[1])
+		journal_destroy_revoke_table(journal->j_revoke_table[1]);
 }
 
 
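
The revoke.c refactoring makes journal_destroy_revoke_caches() safe to call at any point (each cache pointer is checked and reset to NULL), which is what lets journal_init_revoke_caches() unwind through it via the new goto labels. A sketch of the resulting pairing (the jbd_setup() wrapper is hypothetical, for illustration only):

	/* assumption: hypothetical init path illustrating the pairing */
	static int __init jbd_setup(void)
	{
		int ret = journal_init_revoke_caches();

		if (ret)
			return ret;	/* partial allocations already freed */
		return 0;
	}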
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 67ff2024c23c..d15cd6e7251e 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -291,7 +291,7 @@ handle_t *journal_start(journal_t *journal, int nblocks)
 		goto out;
 	}
 
-	lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+	lock_map_acquire(&handle->h_lockdep_map);
 
 out:
 	return handle;
@@ -954,9 +954,10 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 	journal_t *journal = handle->h_transaction->t_journal;
 	int need_brelse = 0;
 	struct journal_head *jh;
+	int ret = 0;
 
 	if (is_handle_aborted(handle))
-		return 0;
+		return ret;
 
 	jh = journal_add_journal_head(bh);
 	JBUFFER_TRACE(jh, "entry");
@@ -1067,7 +1068,16 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 		   time if it is redirtied */
 	}
 
-	/* journal_clean_data_list() may have got there first */
+	/*
+	 * We cannot remove the buffer with io error from the
+	 * committing transaction, because otherwise it would
+	 * miss the error and the commit would not abort.
+	 */
+	if (unlikely(!buffer_uptodate(bh))) {
+		ret = -EIO;
+		goto no_journal;
+	}
+
 	if (jh->b_transaction != NULL) {
 		JBUFFER_TRACE(jh, "unfile from commit");
 		__journal_temp_unlink_buffer(jh);
@@ -1108,7 +1118,7 @@ no_journal:
 	}
 	JBUFFER_TRACE(jh, "exit");
 	journal_put_journal_head(jh);
-	return 0;
+	return ret;
 }
 
 /**
@@ -1448,7 +1458,7 @@ int journal_stop(handle_t *handle)
 		spin_unlock(&journal->j_state_lock);
 	}
 
-	lock_release(&handle->h_lockdep_map, 1, _THIS_IP_);
+	lock_map_release(&handle->h_lockdep_map);
 
 	jbd_free_handle(handle);
 	return err;
@@ -1648,12 +1658,42 @@ out:
 	return;
 }
 
+/*
+ * journal_try_to_free_buffers() could race with journal_commit_transaction().
+ * The latter might still hold a count on buffers when inspecting
+ * them on t_syncdata_list or t_locked_list.
+ *
+ * journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * trying to free that buffer.
+ *
+ * Called with journal->j_state_lock held.
+ */
+static void journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+	transaction_t *transaction = NULL;
+	tid_t tid;
+
+	spin_lock(&journal->j_state_lock);
+	transaction = journal->j_committing_transaction;
+
+	if (!transaction) {
+		spin_unlock(&journal->j_state_lock);
+		return;
+	}
+
+	tid = transaction->t_tid;
+	spin_unlock(&journal->j_state_lock);
+	log_wait_commit(journal, tid);
+}
 
 /**
 * int journal_try_to_free_buffers() - try to free page buffers.
 * @journal: journal for operation
 * @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard should we try to release
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
+ * release the buffers.
 *
 *
 * For all the buffers on this page,
@@ -1682,9 +1722,11 @@ out:
 * journal_try_to_free_buffer() is changing its state.  But that
 * cannot happen because we never reallocate freed data as metadata
 * while the data is part of a transaction.  Yes?
+ *
+ * Return 0 on failure, 1 on success
 */
 int journal_try_to_free_buffers(journal_t *journal,
-				struct page *page, gfp_t unused_gfp_mask)
+				struct page *page, gfp_t gfp_mask)
 {
 	struct buffer_head *head;
 	struct buffer_head *bh;
@@ -1713,7 +1755,28 @@ int journal_try_to_free_buffers(journal_t *journal,
 		if (buffer_jbd(bh))
 			goto busy;
 	} while ((bh = bh->b_this_page) != head);
+
 	ret = try_to_free_buffers(page);
+
+	/*
+	 * There are a number of places where journal_try_to_free_buffers()
+	 * could race with journal_commit_transaction(); the latter still
+	 * holds the reference to the buffers to free while processing them.
+	 * try_to_free_buffers() fails to free those buffers. Some callers
+	 * of releasepage() request page buffers to be dropped; others treat
+	 * the failure to free as an error (such as generic_file_direct_IO())
+	 *
+	 * So, if the caller of try_to_release_page() wants the synchronous
+	 * behaviour (i.e. make sure buffers are dropped upon return),
+	 * let's wait for the current transaction to finish flushing
+	 * dirty data buffers, then try to free those buffers again,
+	 * with the journal locked.
+	 */
+	if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+		journal_wait_for_transaction_sync_data(journal);
+		ret = try_to_free_buffers(page);
+	}
+
 busy:
 	return ret;
 }
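
One point worth spelling out about the journal_try_to_free_buffers() change: the blocking retry only fires for callers whose gfp_mask allows it. In kernels of this era GFP_KERNEL contains both __GFP_WAIT and __GFP_FS, while GFP_NOFS lacks __GFP_FS, so reclaim running in filesystem context never waits here. A sketch from the caller's side (try_to_release_page() is the real VM entry point; the two calls are illustrative):

	/* assumption: illustrative callers, not part of this patch */
	ok = try_to_release_page(page, GFP_KERNEL);	/* may wait for the committing
							 * transaction, then retry the free */
	ok = try_to_release_page(page, GFP_NOFS);	/* __GFP_FS clear: never waits */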