Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (57 commits)
  jbd2: Fix oops in jbd2_journal_init_inode() on corrupted fs
  ext4: Remove "extents" mount option
  block: Add Kconfig help which notes that ext4 needs CONFIG_LBD
  ext4: Make printk's consistently prefixed with "EXT4-fs: "
  ext4: Add sanity checks for the superblock before mounting the filesystem
  ext4: Add mount option to set kjournald's I/O priority
  jbd2: Submit writes to the journal using WRITE_SYNC
  jbd2: Add pid and journal device name to the "kjournald2 starting" message
  ext4: Add markers for better debuggability
  ext4: Remove code to create the journal inode
  ext4: provide function to release metadata pages under memory pressure
  ext3: provide function to release metadata pages under memory pressure
  add releasepage hooks to block devices which can be used by file systems
  ext4: Fix s_dirty_blocks_counter if block allocation failed with nodelalloc
  ext4: Init the complete page while building buddy cache
  ext4: Don't allow new groups to be added during block allocation
  ext4: mark the blocks/inode bitmap beyond end of group as used
  ext4: Use new buffer_head flag to check uninit group bitmaps initialization
  ext4: Fix the race between read_inode_bitmap() and ext4_new_inode()
  ext4: code cleanup
  ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2009-01-08 20:14:59 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-01-08 20:14:59 -0500
commit: 2150edc6c5cf00f7adb54538b9ea2a3e9cedca3f (patch)
tree: f72a0d85e66f500b4cead348a231e3d3b9f357bc /fs/jbd2/transaction.c
parent: cd764695b67386a81964f68e9c66efd9f13f4d29 (diff)
parent: 4b905671d2ea09fd48fed72c581df17e40823f39 (diff)
1 file changed, 46 insertions, 14 deletions
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4f925a4f3d05..46b4e347ed7d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -25,6 +25,7 @@
 #include <linux/timer.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/hrtimer.h>
 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 {
        transaction->t_journal = journal;
        transaction->t_state = T_RUNNING;
+        transaction->t_start_time = ktime_get();
        transaction->t_tid = journal->j_transaction_sequence++;
        transaction->t_expires = jiffies + journal->j_commit_interval;
        spin_lock_init(&transaction->t_handle_lock);
@@ -1240,7 +1242,7 @@ int jbd2_journal_stop(handle_t *handle)
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
-        int old_handle_count, err;
+        int err;
        pid_t pid;
        J_ASSERT(journal_current_handle() == handle);
@@ -1263,24 +1265,54 @@ int jbd2_journal_stop(handle_t *handle)
        /*
         * Implement synchronous transaction batching.  If the handle
         * was synchronous, don't force a commit immediately.  Let's
-         * yield and let another thread piggyback onto this transaction.
+         * yield and let another thread piggyback onto this
-         * Keep doing that while new threads continue to arrive.
+         * transaction.  Keep doing that while new threads continue to
-         * It doesn't cost much - we're about to run a commit and sleep
+         * arrive.  It doesn't cost much - we're about to run a commit
-         * on IO anyway.  Speeds up many-threaded, many-dir operations
+         * and sleep on IO anyway.  Speeds up many-threaded, many-dir
-         * by 30x or more...
+         * operations by 30x or more...
+         *
+         * We try and optimize the sleep time against what the
+         * underlying disk can do, instead of having a static sleep
+         * time.  This is useful for the case where our storage is so
+         * fast that it is more optimal to go ahead and force a flush
+         * and wait for the transaction to be committed than it is to
+         * wait for an arbitrary amount of time for new writers to
+         * join the transaction.  We achieve this by measuring how
+         * long it takes to commit a transaction, and compare it with
+         * how long this transaction has been running, and if run time
+         * < commit time then we sleep for the delta and commit.  This
+         * greatly helps super fast disks that would see slowdowns as
+         * more threads started doing fsyncs.
         *
-         * But don't do this if this process was the most recent one to
+         * But don't do this if this process was the most recent one
-         * perform a synchronous write.  We do this to detect the case where a
+         * to perform a synchronous write.  We do this to detect the
-         * single process is doing a stream of sync writes.  No point in waiting
+         * case where a single process is doing a stream of sync
-         * for joiners in that case.
+         * writes.  No point in waiting for joiners in that case.
         */
        pid = current->pid;
        if (handle->h_sync && journal->j_last_sync_writer != pid) {
+                u64 commit_time, trans_time;
                journal->j_last_sync_writer = pid;
-                do {
-                        old_handle_count = transaction->t_handle_count;
+                spin_lock(&journal->j_state_lock);
-                        schedule_timeout_uninterruptible(1);
+                commit_time = journal->j_average_commit_time;
-                } while (old_handle_count != transaction->t_handle_count);
+                spin_unlock(&journal->j_state_lock);
+                trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+                                                   transaction->t_start_time));
+                commit_time = max_t(u64, commit_time,
+                                    1000*journal->j_min_batch_time);
+                commit_time = min_t(u64, commit_time,
+                                    1000*journal->j_max_batch_time);
+                if (trans_time < commit_time) {
+                        ktime_t expires = ktime_add_ns(ktime_get(),
+                                                       commit_time);
+                        set_current_state(TASK_UNINTERRUPTIBLE);
+                        schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+                }
        }
        current->journal_info = NULL;
author	Linus Torvalds <torvalds@linux-foundation.org>	2009-01-08 20:14:59 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-01-08 20:14:59 -0500
commit	2150edc6c5cf00f7adb54538b9ea2a3e9cedca3f (patch)
tree	f72a0d85e66f500b4cead348a231e3d3b9f357bc /fs/jbd2/transaction.c
parent	cd764695b67386a81964f68e9c66efd9f13f4d29 (diff)
parent	4b905671d2ea09fd48fed72c581df17e40823f39 (diff)

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4f925a4f3d05..46b4e347ed7d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -25,6 +25,7 @@
25	#include <linux/timer.h>	25	#include <linux/timer.h>
26	#include <linux/mm.h>	26	#include <linux/mm.h>
27	#include <linux/highmem.h>	27	#include <linux/highmem.h>
		28	#include <linux/hrtimer.h>
28		29
29	static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);	30	static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
30		31
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
48	{	49	{
49	transaction->t_journal = journal;	50	transaction->t_journal = journal;
50	transaction->t_state = T_RUNNING;	51	transaction->t_state = T_RUNNING;
		52	transaction->t_start_time = ktime_get();
51	transaction->t_tid = journal->j_transaction_sequence++;	53	transaction->t_tid = journal->j_transaction_sequence++;
52	transaction->t_expires = jiffies + journal->j_commit_interval;	54	transaction->t_expires = jiffies + journal->j_commit_interval;
53	spin_lock_init(&transaction->t_handle_lock);	55	spin_lock_init(&transaction->t_handle_lock);
@@ -1240,7 +1242,7 @@ int jbd2_journal_stop(handle_t *handle)
1240	{	1242	{
1241	transaction_t *transaction = handle->h_transaction;	1243	transaction_t *transaction = handle->h_transaction;
1242	journal_t *journal = transaction->t_journal;	1244	journal_t *journal = transaction->t_journal;
1243	int old_handle_count, err;	1245	int err;
1244	pid_t pid;	1246	pid_t pid;
1245		1247
1246	J_ASSERT(journal_current_handle() == handle);	1248	J_ASSERT(journal_current_handle() == handle);
@@ -1263,24 +1265,54 @@ int jbd2_journal_stop(handle_t *handle)
1263	/*	1265	/*
1264	* Implement synchronous transaction batching. If the handle	1266	* Implement synchronous transaction batching. If the handle
1265	* was synchronous, don't force a commit immediately. Let's	1267	* was synchronous, don't force a commit immediately. Let's
1266	* yield and let another thread piggyback onto this transaction.	1268	* yield and let another thread piggyback onto this
1267	* Keep doing that while new threads continue to arrive.	1269	* transaction. Keep doing that while new threads continue to
1268	* It doesn't cost much - we're about to run a commit and sleep	1270	* arrive. It doesn't cost much - we're about to run a commit
1269	* on IO anyway. Speeds up many-threaded, many-dir operations	1271	* and sleep on IO anyway. Speeds up many-threaded, many-dir
1270	* by 30x or more...	1272	* operations by 30x or more...
		1273	*
		1274	* We try and optimize the sleep time against what the
		1275	* underlying disk can do, instead of having a static sleep
		1276	* time. This is useful for the case where our storage is so
		1277	* fast that it is more optimal to go ahead and force a flush
		1278	* and wait for the transaction to be committed than it is to
		1279	* wait for an arbitrary amount of time for new writers to
		1280	* join the transaction. We achieve this by measuring how
		1281	* long it takes to commit a transaction, and compare it with
		1282	* how long this transaction has been running, and if run time
		1283	* < commit time then we sleep for the delta and commit. This
		1284	* greatly helps super fast disks that would see slowdowns as
		1285	* more threads started doing fsyncs.
1271	*	1286	*
1272	* But don't do this if this process was the most recent one to	1287	* But don't do this if this process was the most recent one
1273	* perform a synchronous write. We do this to detect the case where a	1288	* to perform a synchronous write. We do this to detect the
1274	* single process is doing a stream of sync writes. No point in waiting	1289	* case where a single process is doing a stream of sync
1275	* for joiners in that case.	1290	* writes. No point in waiting for joiners in that case.
1276	*/	1291	*/
1277	pid = current->pid;	1292	pid = current->pid;
1278	if (handle->h_sync && journal->j_last_sync_writer != pid) {	1293	if (handle->h_sync && journal->j_last_sync_writer != pid) {
		1294	u64 commit_time, trans_time;
		1295
1279	journal->j_last_sync_writer = pid;	1296	journal->j_last_sync_writer = pid;
1280	do {	1297
1281	old_handle_count = transaction->t_handle_count;	1298	spin_lock(&journal->j_state_lock);
1282	schedule_timeout_uninterruptible(1);	1299	commit_time = journal->j_average_commit_time;
1283	} while (old_handle_count != transaction->t_handle_count);	1300	spin_unlock(&journal->j_state_lock);
		1301
		1302	trans_time = ktime_to_ns(ktime_sub(ktime_get(),
		1303	transaction->t_start_time));
		1304
		1305	commit_time = max_t(u64, commit_time,
		1306	1000*journal->j_min_batch_time);
		1307	commit_time = min_t(u64, commit_time,
		1308	1000*journal->j_max_batch_time);
		1309
		1310	if (trans_time < commit_time) {
		1311	ktime_t expires = ktime_add_ns(ktime_get(),
		1312	commit_time);
		1313	set_current_state(TASK_UNINTERRUPTIBLE);
		1314	schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
		1315	}
1284	}	1316	}
1285		1317
1286	current->journal_info = NULL;	1318	current->journal_info = NULL;