aboutsummaryrefslogtreecommitdiffstats
path: root/fs/jbd/commit.c
diff options
context:
space:
mode:
author: Josef Bacik <jbacik@redhat.com> 2009-01-07 21:07:24 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-01-08 11:31:00 -0500
commit: f420d4dc4272fd223986762df2ad06056ddebada (patch)
tree: 2ae50476e901dc5c2e5d189d44785e27234bcce9 /fs/jbd/commit.c
parent: ef8b646183868b2d042fa6cde0eef2a31263ff85 (diff)
jbd: improve fsync batching
There is a flaw with the way jbd handles fsync batching. If we fsync() a file and we were not the last person to run fsync() on this fs then we automatically sleep for 1 jiffy in order to wait for new writers to join into the transaction before forcing the commit. The problem with this is that with really fast storage (i.e. a Clariion) the time it takes to commit a transaction to disk is way faster than 1 jiffy in most cases, so sleeping means waiting longer with nothing to do than if we just committed the transaction and kept going. Ric Wheeler noticed this when using fs_mark with more than 1 thread: the throughput would plummet as he added more threads. This patch attempts to fix this problem by recording the average time in nanoseconds that it takes to commit a transaction to disk, and what time we started the transaction. If we run an fsync() and we have been running for less time than it takes to commit the transaction to disk, we sleep for the delta amount of time and then commit to disk. We achieve sub-jiffy sleeping using schedule_hrtimeout. This means that the wait time is auto-tuned to the speed of the underlying disk, instead of having this static timeout. I weighted the average according to somebody's comments (Andreas Dilger I think) in order to help normalize random outliers where we take way longer or way less time to commit than the average. I also have a min() check in there to make sure we don't sleep longer than a jiffy in case our storage is super slow; this was requested by Andrew. I unfortunately do not have access to a Clariion, so I had to use a ramdisk to represent a super fast array. I tested with a SATA drive with barrier=1 to make sure there was no regression with local disks, and I also tested with a 4 way multipathed Apple Xserve RAID array and of course the ramdisk. I ran the following command: fs_mark -d /mnt/ext3-test -s 4096 -n 2000 -D 64 -t $i where $i was 2, 4, 8, 16 and 32. I mkfs'ed the fs each time.
Here are my results:

type      threads   with patch   without patch
sata      2         24.6         26.3
sata      4         49.2         48.1
sata      8         70.1         67.0
sata      16        104.0        94.1
sata      32        153.6        142.7
xserve    2         246.4        222.0
xserve    4         480.0        440.8
xserve    8         829.5        730.8
xserve    16        1172.7       1026.9
xserve    32        1816.3       1650.5
ramdisk   2         2538.3       1745.6
ramdisk   4         2942.3       661.9
ramdisk   8         2882.5       999.8
ramdisk   16        2738.7       1801.9
ramdisk   32        2541.9       2394.0

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Cc: Andreas Dilger <adilger@sun.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Ric Wheeler <rwheeler@redhat.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/jbd/commit.c')
-rw-r--r--fs/jbd/commit.c15
1 files changed, 15 insertions, 0 deletions
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 25719d902c51..3fbffb1ea714 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal)
306 int flags; 306 int flags;
307 int err; 307 int err;
308 unsigned long blocknr; 308 unsigned long blocknr;
309 ktime_t start_time;
310 u64 commit_time;
309 char *tagp = NULL; 311 char *tagp = NULL;
310 journal_header_t *header; 312 journal_header_t *header;
311 journal_block_tag_t *tag = NULL; 313 journal_block_tag_t *tag = NULL;
@@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal)
418 commit_transaction->t_state = T_FLUSH; 420 commit_transaction->t_state = T_FLUSH;
419 journal->j_committing_transaction = commit_transaction; 421 journal->j_committing_transaction = commit_transaction;
420 journal->j_running_transaction = NULL; 422 journal->j_running_transaction = NULL;
423 start_time = ktime_get();
421 commit_transaction->t_log_start = journal->j_head; 424 commit_transaction->t_log_start = journal->j_head;
422 wake_up(&journal->j_wait_transaction_locked); 425 wake_up(&journal->j_wait_transaction_locked);
423 spin_unlock(&journal->j_state_lock); 426 spin_unlock(&journal->j_state_lock);
@@ -913,6 +916,18 @@ restart_loop:
913 J_ASSERT(commit_transaction == journal->j_committing_transaction); 916 J_ASSERT(commit_transaction == journal->j_committing_transaction);
914 journal->j_commit_sequence = commit_transaction->t_tid; 917 journal->j_commit_sequence = commit_transaction->t_tid;
915 journal->j_committing_transaction = NULL; 918 journal->j_committing_transaction = NULL;
919 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
920
921 /*
922 * weight the commit time higher than the average time so we don't
923 * react too strongly to vast changes in commit time
924 */
925 if (likely(journal->j_average_commit_time))
926 journal->j_average_commit_time = (commit_time*3 +
927 journal->j_average_commit_time) / 4;
928 else
929 journal->j_average_commit_time = commit_time;
930
916 spin_unlock(&journal->j_state_lock); 931 spin_unlock(&journal->j_state_lock);
917 932
918 if (commit_transaction->t_checkpoint_list == NULL && 933 if (commit_transaction->t_checkpoint_list == NULL &&