diff options
author | Josef Bacik <jbacik@redhat.com> | 2008-11-26 01:14:26 -0500 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2008-11-26 01:14:26 -0500 |
commit | e07f7183a486cf9783d1f8c9d2997b5b39eeb2d4 (patch) | |
tree | 74ed3a563add5fa57e80af03f3f712f2910ac39f | |
parent | 032115fcef837a00336ddf7bda584e89789ea498 (diff) |
jbd2: improve jbd2 fsync batching
This patch removes the static sleep time in favor of a more self
optimizing approach where we measure the average amount of time it
takes to commit a transaction to disk and the amount of time a
transaction has been running. If somebody does a sync write or an
fsync(), traditionally we would sleep for 1 jiffy, which depending on
the value of HZ could be a significant amount of time compared to how
long it takes to commit a transaction to the underlying storage. With
this patch, instead of sleeping for a jiffy, we check to see if the
amount of time this transaction has been running is less than the
average commit time, and if it is we sleep for the delta using
schedule_hrtimeout to give us a higher precision sleep time. This
greatly benefits high end storage where you could end up sleeping for
longer than it takes to commit the transaction and therefore sitting
idle instead of allowing the transaction to be committed by keeping
the sleep time to a minimum so you are sure to always be doing
something.
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r-- | fs/jbd2/commit.c | 14 | ||||
-rw-r--r-- | fs/jbd2/transaction.c | 58 | ||||
-rw-r--r-- | include/linux/jbd2.h | 15 |
3 files changed, 73 insertions, 14 deletions
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6393fd0d804e..f22d1828ea85 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -355,6 +355,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
355 | int flags; | 355 | int flags; |
356 | int err; | 356 | int err; |
357 | unsigned long long blocknr; | 357 | unsigned long long blocknr; |
358 | ktime_t start_time; | ||
359 | u64 commit_time; | ||
358 | char *tagp = NULL; | 360 | char *tagp = NULL; |
359 | journal_header_t *header; | 361 | journal_header_t *header; |
360 | journal_block_tag_t *tag = NULL; | 362 | journal_block_tag_t *tag = NULL; |
@@ -481,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
481 | commit_transaction->t_state = T_FLUSH; | 483 | commit_transaction->t_state = T_FLUSH; |
482 | journal->j_committing_transaction = commit_transaction; | 484 | journal->j_committing_transaction = commit_transaction; |
483 | journal->j_running_transaction = NULL; | 485 | journal->j_running_transaction = NULL; |
486 | start_time = ktime_get(); | ||
484 | commit_transaction->t_log_start = journal->j_head; | 487 | commit_transaction->t_log_start = journal->j_head; |
485 | wake_up(&journal->j_wait_transaction_locked); | 488 | wake_up(&journal->j_wait_transaction_locked); |
486 | spin_unlock(&journal->j_state_lock); | 489 | spin_unlock(&journal->j_state_lock); |
@@ -995,6 +998,17 @@ restart_loop: | |||
995 | J_ASSERT(commit_transaction == journal->j_committing_transaction); | 998 | J_ASSERT(commit_transaction == journal->j_committing_transaction); |
996 | journal->j_commit_sequence = commit_transaction->t_tid; | 999 | journal->j_commit_sequence = commit_transaction->t_tid; |
997 | journal->j_committing_transaction = NULL; | 1000 | journal->j_committing_transaction = NULL; |
1001 | commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); | ||
1002 | |||
1003 | /* | ||
1004 | * weight the running average higher than the newest commit time so we don't | ||
1005 | * react too strongly to vast changes in the commit time | ||
1006 | */ | ||
1007 | if (likely(journal->j_average_commit_time)) | ||
1008 | journal->j_average_commit_time = (commit_time + | ||
1009 | journal->j_average_commit_time*3) / 4; | ||
1010 | else | ||
1011 | journal->j_average_commit_time = commit_time; | ||
998 | spin_unlock(&journal->j_state_lock); | 1012 | spin_unlock(&journal->j_state_lock); |
999 | 1013 | ||
1000 | if (journal->j_commit_callback) | 1014 | if (journal->j_commit_callback) |
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 39b7805a599a..13dcbc990f41 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/timer.h> | 25 | #include <linux/timer.h> |
26 | #include <linux/mm.h> | 26 | #include <linux/mm.h> |
27 | #include <linux/highmem.h> | 27 | #include <linux/highmem.h> |
28 | #include <linux/hrtimer.h> | ||
28 | 29 | ||
29 | static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); | 30 | static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); |
30 | 31 | ||
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) | |||
48 | { | 49 | { |
49 | transaction->t_journal = journal; | 50 | transaction->t_journal = journal; |
50 | transaction->t_state = T_RUNNING; | 51 | transaction->t_state = T_RUNNING; |
52 | transaction->t_start_time = ktime_get(); | ||
51 | transaction->t_tid = journal->j_transaction_sequence++; | 53 | transaction->t_tid = journal->j_transaction_sequence++; |
52 | transaction->t_expires = jiffies + journal->j_commit_interval; | 54 | transaction->t_expires = jiffies + journal->j_commit_interval; |
53 | spin_lock_init(&transaction->t_handle_lock); | 55 | spin_lock_init(&transaction->t_handle_lock); |
@@ -1193,7 +1195,7 @@ int jbd2_journal_stop(handle_t *handle) | |||
1193 | { | 1195 | { |
1194 | transaction_t *transaction = handle->h_transaction; | 1196 | transaction_t *transaction = handle->h_transaction; |
1195 | journal_t *journal = transaction->t_journal; | 1197 | journal_t *journal = transaction->t_journal; |
1196 | int old_handle_count, err; | 1198 | int err; |
1197 | pid_t pid; | 1199 | pid_t pid; |
1198 | 1200 | ||
1199 | J_ASSERT(journal_current_handle() == handle); | 1201 | J_ASSERT(journal_current_handle() == handle); |
@@ -1216,24 +1218,52 @@ int jbd2_journal_stop(handle_t *handle) | |||
1216 | /* | 1218 | /* |
1217 | * Implement synchronous transaction batching. If the handle | 1219 | * Implement synchronous transaction batching. If the handle |
1218 | * was synchronous, don't force a commit immediately. Let's | 1220 | * was synchronous, don't force a commit immediately. Let's |
1219 | * yield and let another thread piggyback onto this transaction. | 1221 | * yield and let another thread piggyback onto this |
1220 | * Keep doing that while new threads continue to arrive. | 1222 | * transaction. Keep doing that while new threads continue to |
1221 | * It doesn't cost much - we're about to run a commit and sleep | 1223 | * arrive. It doesn't cost much - we're about to run a commit |
1222 | * on IO anyway. Speeds up many-threaded, many-dir operations | 1224 | * and sleep on IO anyway. Speeds up many-threaded, many-dir |
1223 | * by 30x or more... | 1225 | * operations by 30x or more... |
1226 | * | ||
1227 | * We try and optimize the sleep time against what the | ||
1228 | * underlying disk can do, instead of having a static sleep | ||
1229 | * time. This is useful for the case where our storage is so | ||
1230 | * fast that it is more optimal to go ahead and force a flush | ||
1231 | * and wait for the transaction to be committed than it is to | ||
1232 | * wait for an arbitrary amount of time for new writers to | ||
1233 | * join the transaction. We achieve this by measuring how | ||
1234 | * long it takes to commit a transaction, and compare it with | ||
1235 | * how long this transaction has been running, and if run time | ||
1236 | * < commit time then we sleep for the delta and commit. This | ||
1237 | * greatly helps super fast disks that would see slowdowns as | ||
1238 | * more threads started doing fsyncs. | ||
1224 | * | 1239 | * |
1225 | * But don't do this if this process was the most recent one to | 1240 | * But don't do this if this process was the most recent one |
1226 | * perform a synchronous write. We do this to detect the case where a | 1241 | * to perform a synchronous write. We do this to detect the |
1227 | * single process is doing a stream of sync writes. No point in waiting | 1242 | * case where a single process is doing a stream of sync |
1228 | * for joiners in that case. | 1243 | * writes. No point in waiting for joiners in that case. |
1229 | */ | 1244 | */ |
1230 | pid = current->pid; | 1245 | pid = current->pid; |
1231 | if (handle->h_sync && journal->j_last_sync_writer != pid) { | 1246 | if (handle->h_sync && journal->j_last_sync_writer != pid) { |
1247 | u64 commit_time, trans_time; | ||
1248 | |||
1232 | journal->j_last_sync_writer = pid; | 1249 | journal->j_last_sync_writer = pid; |
1233 | do { | 1250 | |
1234 | old_handle_count = transaction->t_handle_count; | 1251 | spin_lock(&journal->j_state_lock); |
1235 | schedule_timeout_uninterruptible(1); | 1252 | commit_time = journal->j_average_commit_time; |
1236 | } while (old_handle_count != transaction->t_handle_count); | 1253 | spin_unlock(&journal->j_state_lock); |
1254 | |||
1255 | trans_time = ktime_to_ns(ktime_sub(ktime_get(), | ||
1256 | transaction->t_start_time)); | ||
1257 | |||
1258 | commit_time = min_t(u64, commit_time, | ||
1259 | 1000*jiffies_to_usecs(1)); | ||
1260 | |||
1261 | if (trans_time < commit_time) { | ||
1262 | ktime_t expires = ktime_add_ns(ktime_get(), | ||
1263 | commit_time); | ||
1264 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1265 | schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); | ||
1266 | } | ||
1237 | } | 1267 | } |
1238 | 1268 | ||
1239 | current->journal_info = NULL; | 1269 | current->journal_info = NULL; |
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f36645745489..ab8cef130c28 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h | |||
@@ -638,6 +638,11 @@ struct transaction_s | |||
638 | unsigned long t_expires; | 638 | unsigned long t_expires; |
639 | 639 | ||
640 | /* | 640 | /* |
641 | * When this transaction started, in nanoseconds [no locking] | ||
642 | */ | ||
643 | ktime_t t_start_time; | ||
644 | |||
645 | /* | ||
641 | * How many handles used this transaction? [t_handle_lock] | 646 | * How many handles used this transaction? [t_handle_lock] |
642 | */ | 647 | */ |
643 | int t_handle_count; | 648 | int t_handle_count; |
@@ -939,8 +944,18 @@ struct journal_s | |||
939 | struct buffer_head **j_wbuf; | 944 | struct buffer_head **j_wbuf; |
940 | int j_wbufsize; | 945 | int j_wbufsize; |
941 | 946 | ||
947 | /* | ||
948 | * this is the pid of the last person to run a synchronous operation | ||
949 | * through the journal | ||
950 | */ | ||
942 | pid_t j_last_sync_writer; | 951 | pid_t j_last_sync_writer; |
943 | 952 | ||
953 | /* | ||
954 | * the average amount of time in nanoseconds it takes to commit a | ||
955 | * transaction to disk. [j_state_lock] | ||
956 | */ | ||
957 | u64 j_average_commit_time; | ||
958 | |||
944 | /* This function is called when a transaction is closed */ | 959 | /* This function is called when a transaction is closed */ |
945 | void (*j_commit_callback)(journal_t *, | 960 | void (*j_commit_callback)(journal_t *, |
946 | transaction_t *); | 961 | transaction_t *); |