aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJosef Bacik <jbacik@redhat.com>2008-11-26 01:14:26 -0500
committerTheodore Ts'o <tytso@mit.edu>2008-11-26 01:14:26 -0500
commite07f7183a486cf9783d1f8c9d2997b5b39eeb2d4 (patch)
tree74ed3a563add5fa57e80af03f3f712f2910ac39f
parent032115fcef837a00336ddf7bda584e89789ea498 (diff)
jbd2: improve jbd2 fsync batching
This patch removes the static sleep time in favor of a more self-optimizing approach where we measure the average amount of time it takes to commit a transaction to disk and the amount of time a transaction has been running. If somebody does a sync write or an fsync(), traditionally we would sleep for 1 jiffy, which, depending on the value of HZ, could be a significant amount of time compared to how long it takes to commit a transaction to the underlying storage. With this patch, instead of sleeping for a jiffy, we check to see if the amount of time this transaction has been running is less than the average commit time, and if it is we sleep for the delta using schedule_hrtimeout to give us a higher precision sleep time. This greatly benefits high-end storage, where you could end up sleeping for longer than it takes to commit the transaction and therefore sitting idle instead of allowing the transaction to be committed; keeping the sleep time to a minimum ensures you are always doing something. Signed-off-by: Josef Bacik <jbacik@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r--fs/jbd2/commit.c14
-rw-r--r--fs/jbd2/transaction.c58
-rw-r--r--include/linux/jbd2.h15
3 files changed, 73 insertions, 14 deletions
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6393fd0d804e..f22d1828ea85 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -355,6 +355,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
355 int flags; 355 int flags;
356 int err; 356 int err;
357 unsigned long long blocknr; 357 unsigned long long blocknr;
358 ktime_t start_time;
359 u64 commit_time;
358 char *tagp = NULL; 360 char *tagp = NULL;
359 journal_header_t *header; 361 journal_header_t *header;
360 journal_block_tag_t *tag = NULL; 362 journal_block_tag_t *tag = NULL;
@@ -481,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
481 commit_transaction->t_state = T_FLUSH; 483 commit_transaction->t_state = T_FLUSH;
482 journal->j_committing_transaction = commit_transaction; 484 journal->j_committing_transaction = commit_transaction;
483 journal->j_running_transaction = NULL; 485 journal->j_running_transaction = NULL;
486 start_time = ktime_get();
484 commit_transaction->t_log_start = journal->j_head; 487 commit_transaction->t_log_start = journal->j_head;
485 wake_up(&journal->j_wait_transaction_locked); 488 wake_up(&journal->j_wait_transaction_locked);
486 spin_unlock(&journal->j_state_lock); 489 spin_unlock(&journal->j_state_lock);
@@ -995,6 +998,17 @@ restart_loop:
995 J_ASSERT(commit_transaction == journal->j_committing_transaction); 998 J_ASSERT(commit_transaction == journal->j_committing_transaction);
996 journal->j_commit_sequence = commit_transaction->t_tid; 999 journal->j_commit_sequence = commit_transaction->t_tid;
997 journal->j_committing_transaction = NULL; 1000 journal->j_committing_transaction = NULL;
1001 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1002
1003 /*
1004 * weight the commit time higher than the average time so we don't
1005 * react too strongly to vast changes in the commit time
1006 */
1007 if (likely(journal->j_average_commit_time))
1008 journal->j_average_commit_time = (commit_time +
1009 journal->j_average_commit_time*3) / 4;
1010 else
1011 journal->j_average_commit_time = commit_time;
998 spin_unlock(&journal->j_state_lock); 1012 spin_unlock(&journal->j_state_lock);
999 1013
1000 if (journal->j_commit_callback) 1014 if (journal->j_commit_callback)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 39b7805a599a..13dcbc990f41 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -25,6 +25,7 @@
25#include <linux/timer.h> 25#include <linux/timer.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h>
28 29
29static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 30static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
30 31
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
48{ 49{
49 transaction->t_journal = journal; 50 transaction->t_journal = journal;
50 transaction->t_state = T_RUNNING; 51 transaction->t_state = T_RUNNING;
52 transaction->t_start_time = ktime_get();
51 transaction->t_tid = journal->j_transaction_sequence++; 53 transaction->t_tid = journal->j_transaction_sequence++;
52 transaction->t_expires = jiffies + journal->j_commit_interval; 54 transaction->t_expires = jiffies + journal->j_commit_interval;
53 spin_lock_init(&transaction->t_handle_lock); 55 spin_lock_init(&transaction->t_handle_lock);
@@ -1193,7 +1195,7 @@ int jbd2_journal_stop(handle_t *handle)
1193{ 1195{
1194 transaction_t *transaction = handle->h_transaction; 1196 transaction_t *transaction = handle->h_transaction;
1195 journal_t *journal = transaction->t_journal; 1197 journal_t *journal = transaction->t_journal;
1196 int old_handle_count, err; 1198 int err;
1197 pid_t pid; 1199 pid_t pid;
1198 1200
1199 J_ASSERT(journal_current_handle() == handle); 1201 J_ASSERT(journal_current_handle() == handle);
@@ -1216,24 +1218,52 @@ int jbd2_journal_stop(handle_t *handle)
1216 /* 1218 /*
1217 * Implement synchronous transaction batching. If the handle 1219 * Implement synchronous transaction batching. If the handle
1218 * was synchronous, don't force a commit immediately. Let's 1220 * was synchronous, don't force a commit immediately. Let's
1219 * yield and let another thread piggyback onto this transaction. 1221 * yield and let another thread piggyback onto this
1220 * Keep doing that while new threads continue to arrive. 1222 * transaction. Keep doing that while new threads continue to
1221 * It doesn't cost much - we're about to run a commit and sleep 1223 * arrive. It doesn't cost much - we're about to run a commit
1222 * on IO anyway. Speeds up many-threaded, many-dir operations 1224 * and sleep on IO anyway. Speeds up many-threaded, many-dir
1223 * by 30x or more... 1225 * operations by 30x or more...
1226 *
1227 * We try and optimize the sleep time against what the
1228 * underlying disk can do, instead of having a static sleep
1229 * time. This is useful for the case where our storage is so
1230 * fast that it is more optimal to go ahead and force a flush
1231 * and wait for the transaction to be committed than it is to
1232 * wait for an arbitrary amount of time for new writers to
1233 * join the transaction. We achieve this by measuring how
1234 * long it takes to commit a transaction, and compare it with
1235 * how long this transaction has been running, and if run time
1236 * < commit time then we sleep for the delta and commit. This
1237 * greatly helps super fast disks that would see slowdowns as
1238 * more threads started doing fsyncs.
1224 * 1239 *
1225 * But don't do this if this process was the most recent one to 1240 * But don't do this if this process was the most recent one
1226 * perform a synchronous write. We do this to detect the case where a 1241 * to perform a synchronous write. We do this to detect the
1227 * single process is doing a stream of sync writes. No point in waiting 1242 * case where a single process is doing a stream of sync
1228 * for joiners in that case. 1243 * writes. No point in waiting for joiners in that case.
1229 */ 1244 */
1230 pid = current->pid; 1245 pid = current->pid;
1231 if (handle->h_sync && journal->j_last_sync_writer != pid) { 1246 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1247 u64 commit_time, trans_time;
1248
1232 journal->j_last_sync_writer = pid; 1249 journal->j_last_sync_writer = pid;
1233 do { 1250
1234 old_handle_count = transaction->t_handle_count; 1251 spin_lock(&journal->j_state_lock);
1235 schedule_timeout_uninterruptible(1); 1252 commit_time = journal->j_average_commit_time;
1236 } while (old_handle_count != transaction->t_handle_count); 1253 spin_unlock(&journal->j_state_lock);
1254
1255 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1256 transaction->t_start_time));
1257
1258 commit_time = min_t(u64, commit_time,
1259 1000*jiffies_to_usecs(1));
1260
1261 if (trans_time < commit_time) {
1262 ktime_t expires = ktime_add_ns(ktime_get(),
1263 commit_time);
1264 set_current_state(TASK_UNINTERRUPTIBLE);
1265 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1266 }
1237 } 1267 }
1238 1268
1239 current->journal_info = NULL; 1269 current->journal_info = NULL;
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index f36645745489..ab8cef130c28 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -638,6 +638,11 @@ struct transaction_s
638 unsigned long t_expires; 638 unsigned long t_expires;
639 639
640 /* 640 /*
641 * When this transaction started, in nanoseconds [no locking]
642 */
643 ktime_t t_start_time;
644
645 /*
641 * How many handles used this transaction? [t_handle_lock] 646 * How many handles used this transaction? [t_handle_lock]
642 */ 647 */
643 int t_handle_count; 648 int t_handle_count;
@@ -939,8 +944,18 @@ struct journal_s
939 struct buffer_head **j_wbuf; 944 struct buffer_head **j_wbuf;
940 int j_wbufsize; 945 int j_wbufsize;
941 946
947 /*
948 * this is the pid of the last person to run a synchronous operation
949 * through the journal
950 */
942 pid_t j_last_sync_writer; 951 pid_t j_last_sync_writer;
943 952
953 /*
954 * the average amount of time in nanoseconds it takes to commit a
955 * transaction to disk. [j_state_lock]
956 */
957 u64 j_average_commit_time;
958
944 /* This function is called when a transaction is closed */ 959 /* This function is called when a transaction is closed */
945 void (*j_commit_callback)(journal_t *, 960 void (*j_commit_callback)(journal_t *,
946 transaction_t *); 961 transaction_t *);