jbd: improve fsync batching

There is a flaw with the way jbd handles fsync batching. If we fsync() a file and we were not the last person to run fsync() on this fs, then we automatically sleep for 1 jiffy in order to wait for new writers to join the transaction before forcing the commit. The problem with this is that with really fast storage (i.e. a Clariion) the time it takes to commit a transaction to disk is way faster than 1 jiffy in most cases, so sleeping means waiting longer with nothing to do than if we just committed the transaction and kept going. Ric Wheeler noticed this when using fs_mark with more than 1 thread: the throughput would plummet as he added more threads.

This patch attempts to fix this problem by recording the average time in nanoseconds that it takes to commit a transaction to disk, and what time we started the transaction. If we run an fsync() and we have been running for less time than it takes to commit the transaction to disk, we sleep for the delta amount of time and then commit to disk. We achieve sub-jiffy sleeping using schedule_hrtimeout. This means that the wait time is auto-tuned to the speed of the underlying disk, instead of having this static timeout. I weighted the average according to somebody's comments (Andreas Dilger, I think) in order to help normalize random outliers where we take way longer or way less time to commit than the average. I also have a min() check in there to make sure we don't sleep longer than a jiffy in case our storage is super slow; this was requested by Andrew.

I unfortunately do not have access to a Clariion, so I had to use a ramdisk to represent a super fast array. I tested with a SATA drive with barrier=1 to make sure there was no regression with local disks, and I tested with a 4-way multipathed Apple Xserve RAID array and, of course, the ramdisk. I ran the following command:

    fs_mark -d /mnt/ext3-test -s 4096 -n 2000 -D 64 -t $i

where $i was 2, 4, 8, 16 and 32. I mkfs'ed the fs each time.
Here are my results:

    type      threads   with patch   without patch
    sata      2         24.6         26.3
    sata      4         49.2         48.1
    sata      8         70.1         67.0
    sata      16        104.0        94.1
    sata      32        153.6        142.7
    xserve    2         246.4        222.0
    xserve    4         480.0        440.8
    xserve    8         829.5        730.8
    xserve    16        1172.7       1026.9
    xserve    32        1816.3       1650.5
    ramdisk   2         2538.3       1745.6
    ramdisk   4         2942.3       661.9
    ramdisk   8         2882.5       999.8
    ramdisk   16        2738.7       1801.9
    ramdisk   32        2541.9       2394.0

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Cc: Andreas Dilger <adilger@sun.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Ric Wheeler <rwheeler@redhat.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Josef Bacik <jbacik@redhat.com> 2009-01-07 21:07:24 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-01-08 11:31:00 -0500
commit: f420d4dc4272fd223986762df2ad06056ddebada (patch)
tree: 2ae50476e901dc5c2e5d189d44785e27234bcce9
parent: ef8b646183868b2d042fa6cde0eef2a31263ff85 (diff)
3 files changed, 63 insertions, 5 deletions
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 25719d902c51..3fbffb1ea714 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal)
        int flags;
        int err;
        unsigned long blocknr;
+        ktime_t start_time;
+        u64 commit_time;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
@@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal)
        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
+        start_time = ktime_get();
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);
@@ -913,6 +916,18 @@ restart_loop:
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
+        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+        /*
+         * weight the commit time higher than the average time so we don't
+         * react too strongly to vast changes in commit time
+         */
+        if (likely(journal->j_average_commit_time))
+                journal->j_average_commit_time = (commit_time*3 +
+                                journal->j_average_commit_time) / 4;
+        else
+                journal->j_average_commit_time = commit_time;
        spin_unlock(&journal->j_state_lock);
        if (commit_transaction->t_checkpoint_list == NULL &&
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d4c32c8808..b51fbd4b2913 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -25,6 +25,7 @@
 #include <linux/timer.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/hrtimer.h>
 static void __journal_temp_unlink_buffer(struct journal_head *jh);
@@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
 {
        transaction->t_journal = journal;
        transaction->t_state = T_RUNNING;
+        transaction->t_start_time = ktime_get();
        transaction->t_tid = journal->j_transaction_sequence++;
        transaction->t_expires = jiffies + journal->j_commit_interval;
        spin_lock_init(&transaction->t_handle_lock);
@@ -1370,7 +1372,7 @@ int journal_stop(handle_t *handle)
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
-        int old_handle_count, err;
+        int err;
        pid_t pid;
        J_ASSERT(journal_current_handle() == handle);
@@ -1399,6 +1401,17 @@ int journal_stop(handle_t *handle)
         * on IO anyway.  Speeds up many-threaded, many-dir operations
         * by 30x or more...
         *
+         * We try and optimize the sleep time against what the underlying disk
+         * can do, instead of having a static sleep time.  This is usefull for
+         * the case where our storage is so fast that it is more optimal to go
+         * ahead and force a flush and wait for the transaction to be committed
+         * than it is to wait for an arbitrary amount of time for new writers to
+         * join the transaction.  We acheive this by measuring how long it takes
+         * to commit a transaction, and compare it with how long this
+         * transaction has been running, and if run time < commit time then we
+         * sleep for the delta and commit.  This greatly helps super fast disks
+         * that would see slowdowns as more threads started doing fsyncs.
+         *
         * But don't do this if this process was the most recent one to
         * perform a synchronous write.  We do this to detect the case where a
         * single process is doing a stream of sync writes.  No point in waiting
@@ -1406,11 +1419,26 @@ int journal_stop(handle_t *handle)
         */
        pid = current->pid;
        if (handle->h_sync && journal->j_last_sync_writer != pid) {
+                u64 commit_time, trans_time;
                journal->j_last_sync_writer = pid;
-                do {
-                        old_handle_count = transaction->t_handle_count;
+                spin_lock(&journal->j_state_lock);
-                        schedule_timeout_uninterruptible(1);
+                commit_time = journal->j_average_commit_time;
-                } while (old_handle_count != transaction->t_handle_count);
+                spin_unlock(&journal->j_state_lock);
+                trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+                                                   transaction->t_start_time));
+                commit_time = min_t(u64, commit_time,
+                                    1000*jiffies_to_usecs(1));
+                if (trans_time < commit_time) {
+                        ktime_t expires = ktime_add_ns(ktime_get(),
+                                                       commit_time);
+                        set_current_state(TASK_UNINTERRUPTIBLE);
+                        schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+                }
        }
        current->journal_info = NULL;
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 346e2b80be7d..6384b19efe64 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -543,6 +543,11 @@ struct transaction_s
        unsigned long           t_expires;
        /*
+         * When this transaction started, in nanoseconds [no locking]
+         */
+        ktime_t                 t_start_time;
+        /*
         * How many handles used this transaction? [t_handle_lock]
         */
        int t_handle_count;
@@ -798,9 +803,19 @@ struct journal_s
        struct buffer_head      **j_wbuf;
        int                     j_wbufsize;
+        /*
+         * this is the pid of the last person to run a synchronous operation
+         * through the journal.
+         */
        pid_t                   j_last_sync_writer;
        /*
+         * the average amount of time in nanoseconds it takes to commit a
+         * transaction to the disk.  [j_state_lock]
+         */
+        u64                     j_average_commit_time;
+        /*
         * An opaque pointer to fs-private information.  ext3 puts its
         * superblock pointer here
         */
author	Josef Bacik <jbacik@redhat.com>	2009-01-07 21:07:24 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-01-08 11:31:00 -0500
commit	f420d4dc4272fd223986762df2ad06056ddebada (patch)
tree	2ae50476e901dc5c2e5d189d44785e27234bcce9
parent	ef8b646183868b2d042fa6cde0eef2a31263ff85 (diff)

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 25719d902c51..3fbffb1ea714 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c
@@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal)
306	int flags;	306	int flags;
307	int err;	307	int err;
308	unsigned long blocknr;	308	unsigned long blocknr;
		309	ktime_t start_time;
		310	u64 commit_time;
309	char *tagp = NULL;	311	char *tagp = NULL;
310	journal_header_t *header;	312	journal_header_t *header;
311	journal_block_tag_t *tag = NULL;	313	journal_block_tag_t *tag = NULL;
@@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal)
418	commit_transaction->t_state = T_FLUSH;	420	commit_transaction->t_state = T_FLUSH;
419	journal->j_committing_transaction = commit_transaction;	421	journal->j_committing_transaction = commit_transaction;
420	journal->j_running_transaction = NULL;	422	journal->j_running_transaction = NULL;
		423	start_time = ktime_get();
421	commit_transaction->t_log_start = journal->j_head;	424	commit_transaction->t_log_start = journal->j_head;
422	wake_up(&journal->j_wait_transaction_locked);	425	wake_up(&journal->j_wait_transaction_locked);
423	spin_unlock(&journal->j_state_lock);	426	spin_unlock(&journal->j_state_lock);
@@ -913,6 +916,18 @@ restart_loop:
913	J_ASSERT(commit_transaction == journal->j_committing_transaction);	916	J_ASSERT(commit_transaction == journal->j_committing_transaction);
914	journal->j_commit_sequence = commit_transaction->t_tid;	917	journal->j_commit_sequence = commit_transaction->t_tid;
915	journal->j_committing_transaction = NULL;	918	journal->j_committing_transaction = NULL;
		919	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
		920
		921	/*
		922	* weight the commit time higher than the average time so we don't
		923	* react too strongly to vast changes in commit time
		924	*/
		925	if (likely(journal->j_average_commit_time))
		926	journal->j_average_commit_time = (commit_time*3 +
		927	journal->j_average_commit_time) / 4;
		928	else
		929	journal->j_average_commit_time = commit_time;
		930
916	spin_unlock(&journal->j_state_lock);	931	spin_unlock(&journal->j_state_lock);
917		932
918	if (commit_transaction->t_checkpoint_list == NULL &&	933	if (commit_transaction->t_checkpoint_list == NULL &&


diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 60d4c32c8808..b51fbd4b2913 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c
@@ -25,6 +25,7 @@
25	#include <linux/timer.h>	25	#include <linux/timer.h>
26	#include <linux/mm.h>	26	#include <linux/mm.h>
27	#include <linux/highmem.h>	27	#include <linux/highmem.h>
		28	#include <linux/hrtimer.h>
28		29
29	static void __journal_temp_unlink_buffer(struct journal_head *jh);	30	static void __journal_temp_unlink_buffer(struct journal_head *jh);
30		31
@@ -49,6 +50,7 @@ get_transaction(journal_t journal, transaction_t transaction)
49	{	50	{
50	transaction->t_journal = journal;	51	transaction->t_journal = journal;
51	transaction->t_state = T_RUNNING;	52	transaction->t_state = T_RUNNING;
		53	transaction->t_start_time = ktime_get();
52	transaction->t_tid = journal->j_transaction_sequence++;	54	transaction->t_tid = journal->j_transaction_sequence++;
53	transaction->t_expires = jiffies + journal->j_commit_interval;	55	transaction->t_expires = jiffies + journal->j_commit_interval;
54	spin_lock_init(&transaction->t_handle_lock);	56	spin_lock_init(&transaction->t_handle_lock);
@@ -1370,7 +1372,7 @@ int journal_stop(handle_t *handle)
1370	{	1372	{
1371	transaction_t *transaction = handle->h_transaction;	1373	transaction_t *transaction = handle->h_transaction;
1372	journal_t *journal = transaction->t_journal;	1374	journal_t *journal = transaction->t_journal;
1373	int old_handle_count, err;	1375	int err;
1374	pid_t pid;	1376	pid_t pid;
1375		1377
1376	J_ASSERT(journal_current_handle() == handle);	1378	J_ASSERT(journal_current_handle() == handle);
@@ -1399,6 +1401,17 @@ int journal_stop(handle_t *handle)
1399	* on IO anyway. Speeds up many-threaded, many-dir operations	1401	* on IO anyway. Speeds up many-threaded, many-dir operations
1400	* by 30x or more...	1402	* by 30x or more...
1401	*	1403	*
		1404	* We try and optimize the sleep time against what the underlying disk
		1405	* can do, instead of having a static sleep time. This is usefull for
		1406	* the case where our storage is so fast that it is more optimal to go
		1407	* ahead and force a flush and wait for the transaction to be committed
		1408	* than it is to wait for an arbitrary amount of time for new writers to
		1409	* join the transaction. We acheive this by measuring how long it takes
		1410	* to commit a transaction, and compare it with how long this
		1411	* transaction has been running, and if run time < commit time then we
		1412	* sleep for the delta and commit. This greatly helps super fast disks
		1413	* that would see slowdowns as more threads started doing fsyncs.
		1414	*
1402	* But don't do this if this process was the most recent one to	1415	* But don't do this if this process was the most recent one to
1403	* perform a synchronous write. We do this to detect the case where a	1416	* perform a synchronous write. We do this to detect the case where a
1404	* single process is doing a stream of sync writes. No point in waiting	1417	* single process is doing a stream of sync writes. No point in waiting
@@ -1406,11 +1419,26 @@ int journal_stop(handle_t *handle)
1406	*/	1419	*/
1407	pid = current->pid;	1420	pid = current->pid;
1408	if (handle->h_sync && journal->j_last_sync_writer != pid) {	1421	if (handle->h_sync && journal->j_last_sync_writer != pid) {
		1422	u64 commit_time, trans_time;
		1423
1409	journal->j_last_sync_writer = pid;	1424	journal->j_last_sync_writer = pid;
1410	do {	1425
1411	old_handle_count = transaction->t_handle_count;	1426	spin_lock(&journal->j_state_lock);
1412	schedule_timeout_uninterruptible(1);	1427	commit_time = journal->j_average_commit_time;
1413	} while (old_handle_count != transaction->t_handle_count);	1428	spin_unlock(&journal->j_state_lock);
		1429
		1430	trans_time = ktime_to_ns(ktime_sub(ktime_get(),
		1431	transaction->t_start_time));
		1432
		1433	commit_time = min_t(u64, commit_time,
		1434	1000*jiffies_to_usecs(1));
		1435
		1436	if (trans_time < commit_time) {
		1437	ktime_t expires = ktime_add_ns(ktime_get(),
		1438	commit_time);
		1439	set_current_state(TASK_UNINTERRUPTIBLE);
		1440	schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
		1441	}
1414	}	1442	}
1415		1443
1416	current->journal_info = NULL;	1444	current->journal_info = NULL;


diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 346e2b80be7d..6384b19efe64 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h
@@ -543,6 +543,11 @@ struct transaction_s
543	unsigned long t_expires;	543	unsigned long t_expires;
544		544
545	/*	545	/*
		546	* When this transaction started, in nanoseconds [no locking]
		547	*/
		548	ktime_t t_start_time;
		549
		550	/*
546	* How many handles used this transaction? [t_handle_lock]	551	* How many handles used this transaction? [t_handle_lock]
547	*/	552	*/
548	int t_handle_count;	553	int t_handle_count;
@@ -798,9 +803,19 @@ struct journal_s
798	struct buffer_head **j_wbuf;	803	struct buffer_head **j_wbuf;
799	int j_wbufsize;	804	int j_wbufsize;
800		805
		806	/*
		807	* this is the pid of the last person to run a synchronous operation
		808	* through the journal.
		809	*/
801	pid_t j_last_sync_writer;	810	pid_t j_last_sync_writer;
802		811
803	/*	812	/*
		813	* the average amount of time in nanoseconds it takes to commit a
		814	* transaction to the disk. [j_state_lock]
		815	*/
		816	u64 j_average_commit_time;
		817
		818	/*
804	* An opaque pointer to fs-private information. ext3 puts its	819	* An opaque pointer to fs-private information. ext3 puts its
805	* superblock pointer here	820	* superblock pointer here
806	*/	821	*/