aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTheodore Ts'o <tytso@mit.edu>2009-01-03 20:27:38 -0500
committerTheodore Ts'o <tytso@mit.edu>2009-01-03 20:27:38 -0500
commit30773840c19cea60dcef39545960d541b1ac1cf8 (patch)
treef220a2dce451a40dc7264e8fd70c77c5a3908873
parentd7cfa4684d82f58e5d7cb73b8a3c88c169937f25 (diff)
ext4: add fsync batch tuning knobs
Add new mount options, min_batch_time and max_batch_time, which control how long the jbd2 layer should wait for additional filesystem operations to get batched with a synchronous write transaction. Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r--Documentation/filesystems/ext4.txt29
-rw-r--r--fs/ext4/ext4.h7
-rw-r--r--fs/ext4/ext4_sb.h2
-rw-r--r--fs/ext4/super.c47
-rw-r--r--fs/jbd2/journal.c2
-rw-r--r--fs/jbd2/transaction.c4
-rw-r--r--include/linux/jbd2.h8
7 files changed, 91 insertions, 8 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index f75ab101c00..e3fcbea3ec8 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -283,6 +283,35 @@ delalloc (*) Deferring block allocation until write-out time.
283nodelalloc Disable delayed allocation. Blocks are allocated 283nodelalloc Disable delayed allocation. Blocks are allocated
284 when data is copied from user to page cache. 284 when data is copied from user to page cache.
285 285
286max_batch_time=usec Maximum amount of time ext4 should wait for
287 additional filesystem operations to be batched
288 together with a synchronous write operation.
289 Since a synchronous write operation is going to
290 force a commit and then a wait for the I/O to
291 complete, it doesn't cost much, and can be a
292 huge throughput win, we wait for a small amount
293 of time to see if any other transactions can
294 piggyback on the synchronous write. The
295 algorithm used is designed to automatically tune
296 for the speed of the disk, by measuring the
297 amount of time (on average) that it takes to
298 finish committing a transaction. Call this time
299 the "commit time". If the time that the
300 transaction has been running is less than the
301 commit time, ext4 will try sleeping for the
302 commit time to see if other operations will join
303 the transaction. The commit time is capped by
304 the max_batch_time, which defaults to 15000us
305 (15ms). This optimization can be turned off
306 entirely by setting max_batch_time to 0.
307
308min_batch_time=usec This parameter sets the commit time (as
309 described above) to be at least min_batch_time.
310 It defaults to zero microseconds. Increasing
311 this parameter may improve the throughput of
312 multi-threaded, synchronous workloads on very
313 fast disks, at the cost of increasing latency.
314
286Data Mode 315Data Mode
287========= 316=========
288There are 3 different data modes: 317There are 3 different data modes:
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ac8551e0b70..9ba9fd6d14d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -328,6 +328,7 @@ struct ext4_mount_options {
328 uid_t s_resuid; 328 uid_t s_resuid;
329 gid_t s_resgid; 329 gid_t s_resgid;
330 unsigned long s_commit_interval; 330 unsigned long s_commit_interval;
331 u32 s_min_batch_time, s_max_batch_time;
331#ifdef CONFIG_QUOTA 332#ifdef CONFIG_QUOTA
332 int s_jquota_fmt; 333 int s_jquota_fmt;
333 char *s_qf_names[MAXQUOTAS]; 334 char *s_qf_names[MAXQUOTAS];
@@ -806,6 +807,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
806#define EXT4_DEFM_JMODE_WBACK 0x0060 807#define EXT4_DEFM_JMODE_WBACK 0x0060
807 808
808/* 809/*
810 * Default journal batch times
811 */
812#define EXT4_DEF_MIN_BATCH_TIME 0
813#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
814
815/*
809 * Structure of a directory entry 816 * Structure of a directory entry
810 */ 817 */
811#define EXT4_NAME_LEN 255 818#define EXT4_NAME_LEN 255
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 3db800f399a..039b6ea1a04 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -74,6 +74,8 @@ struct ext4_sb_info {
74 struct journal_s *s_journal; 74 struct journal_s *s_journal;
75 struct list_head s_orphan; 75 struct list_head s_orphan;
76 unsigned long s_commit_interval; 76 unsigned long s_commit_interval;
77 u32 s_max_batch_time;
78 u32 s_min_batch_time;
77 struct block_device *journal_bdev; 79 struct block_device *journal_bdev;
78#ifdef CONFIG_JBD2_DEBUG 80#ifdef CONFIG_JBD2_DEBUG
79 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ 81 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dc27d4c613c..da377f9521b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -705,10 +705,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
705#endif 705#endif
706 if (!test_opt(sb, RESERVATION)) 706 if (!test_opt(sb, RESERVATION))
707 seq_puts(seq, ",noreservation"); 707 seq_puts(seq, ",noreservation");
708 if (sbi->s_commit_interval) { 708 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
709 seq_printf(seq, ",commit=%u", 709 seq_printf(seq, ",commit=%u",
710 (unsigned) (sbi->s_commit_interval / HZ)); 710 (unsigned) (sbi->s_commit_interval / HZ));
711 } 711 }
712 if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
713 seq_printf(seq, ",min_batch_time=%u",
714 (unsigned) sbi->s_min_batch_time);
715 }
716 if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
717 seq_printf(seq, ",max_batch_time=%u",
718 (unsigned) sbi->s_min_batch_time);
719 }
720
712 /* 721 /*
713 * We're changing the default of barrier mount option, so 722 * We're changing the default of barrier mount option, so
714 * let's always display its mount state so it's clear what its 723 * let's always display its mount state so it's clear what its
@@ -874,7 +883,8 @@ enum {
874 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 883 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
875 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 884 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
876 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 885 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
877 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 886 Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
887 Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
878 Opt_journal_checksum, Opt_journal_async_commit, 888 Opt_journal_checksum, Opt_journal_async_commit,
879 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 889 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
880 Opt_data_err_abort, Opt_data_err_ignore, 890 Opt_data_err_abort, Opt_data_err_ignore,
@@ -913,6 +923,8 @@ static const match_table_t tokens = {
913 {Opt_nobh, "nobh"}, 923 {Opt_nobh, "nobh"},
914 {Opt_bh, "bh"}, 924 {Opt_bh, "bh"},
915 {Opt_commit, "commit=%u"}, 925 {Opt_commit, "commit=%u"},
926 {Opt_min_batch_time, "min_batch_time=%u"},
927 {Opt_max_batch_time, "max_batch_time=%u"},
916 {Opt_journal_update, "journal=update"}, 928 {Opt_journal_update, "journal=update"},
917 {Opt_journal_inum, "journal=%u"}, 929 {Opt_journal_inum, "journal=%u"},
918 {Opt_journal_dev, "journal_dev=%u"}, 930 {Opt_journal_dev, "journal_dev=%u"},
@@ -1131,6 +1143,22 @@ static int parse_options(char *options, struct super_block *sb,
1131 option = JBD2_DEFAULT_MAX_COMMIT_AGE; 1143 option = JBD2_DEFAULT_MAX_COMMIT_AGE;
1132 sbi->s_commit_interval = HZ * option; 1144 sbi->s_commit_interval = HZ * option;
1133 break; 1145 break;
1146 case Opt_max_batch_time:
1147 if (match_int(&args[0], &option))
1148 return 0;
1149 if (option < 0)
1150 return 0;
1151 if (option == 0)
1152 option = EXT4_DEF_MAX_BATCH_TIME;
1153 sbi->s_max_batch_time = option;
1154 break;
1155 case Opt_min_batch_time:
1156 if (match_int(&args[0], &option))
1157 return 0;
1158 if (option < 0)
1159 return 0;
1160 sbi->s_min_batch_time = option;
1161 break;
1134 case Opt_data_journal: 1162 case Opt_data_journal:
1135 data_opt = EXT4_MOUNT_JOURNAL_DATA; 1163 data_opt = EXT4_MOUNT_JOURNAL_DATA;
1136 goto datacheck; 1164 goto datacheck;
@@ -1979,6 +2007,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1979 2007
1980 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 2008 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
1981 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 2009 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
2010 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
2011 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2012 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
1982 2013
1983 set_opt(sbi->s_mount_opt, RESERVATION); 2014 set_opt(sbi->s_mount_opt, RESERVATION);
1984 set_opt(sbi->s_mount_opt, BARRIER); 2015 set_opt(sbi->s_mount_opt, BARRIER);
@@ -2524,11 +2555,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
2524{ 2555{
2525 struct ext4_sb_info *sbi = EXT4_SB(sb); 2556 struct ext4_sb_info *sbi = EXT4_SB(sb);
2526 2557
2527 if (sbi->s_commit_interval) 2558 journal->j_commit_interval = sbi->s_commit_interval;
2528 journal->j_commit_interval = sbi->s_commit_interval; 2559 journal->j_min_batch_time = sbi->s_min_batch_time;
2529 /* We could also set up an ext4-specific default for the commit 2560 journal->j_max_batch_time = sbi->s_max_batch_time;
2530 * interval here, but for now we'll just fall back to the jbd
2531 * default. */
2532 2561
2533 spin_lock(&journal->j_state_lock); 2562 spin_lock(&journal->j_state_lock);
2534 if (test_opt(sb, BARRIER)) 2563 if (test_opt(sb, BARRIER))
@@ -3042,6 +3071,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3042 old_opts.s_resuid = sbi->s_resuid; 3071 old_opts.s_resuid = sbi->s_resuid;
3043 old_opts.s_resgid = sbi->s_resgid; 3072 old_opts.s_resgid = sbi->s_resgid;
3044 old_opts.s_commit_interval = sbi->s_commit_interval; 3073 old_opts.s_commit_interval = sbi->s_commit_interval;
3074 old_opts.s_min_batch_time = sbi->s_min_batch_time;
3075 old_opts.s_max_batch_time = sbi->s_max_batch_time;
3045#ifdef CONFIG_QUOTA 3076#ifdef CONFIG_QUOTA
3046 old_opts.s_jquota_fmt = sbi->s_jquota_fmt; 3077 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
3047 for (i = 0; i < MAXQUOTAS; i++) 3078 for (i = 0; i < MAXQUOTAS; i++)
@@ -3178,6 +3209,8 @@ restore_opts:
3178 sbi->s_resuid = old_opts.s_resuid; 3209 sbi->s_resuid = old_opts.s_resuid;
3179 sbi->s_resgid = old_opts.s_resgid; 3210 sbi->s_resgid = old_opts.s_resgid;
3180 sbi->s_commit_interval = old_opts.s_commit_interval; 3211 sbi->s_commit_interval = old_opts.s_commit_interval;
3212 sbi->s_min_batch_time = old_opts.s_min_batch_time;
3213 sbi->s_max_batch_time = old_opts.s_max_batch_time;
3181#ifdef CONFIG_QUOTA 3214#ifdef CONFIG_QUOTA
3182 sbi->s_jquota_fmt = old_opts.s_jquota_fmt; 3215 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
3183 for (i = 0; i < MAXQUOTAS; i++) { 3216 for (i = 0; i < MAXQUOTAS; i++) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 74d87290381..fd1d7557a09 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -964,6 +964,8 @@ static journal_t * journal_init_common (void)
964 spin_lock_init(&journal->j_state_lock); 964 spin_lock_init(&journal->j_state_lock);
965 965
966 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); 966 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
967 journal->j_min_batch_time = 0;
968 journal->j_max_batch_time = 15000; /* 15ms */
967 969
968 /* The journal is marked for error until we succeed with recovery! */ 970 /* The journal is marked for error until we succeed with recovery! */
969 journal->j_flags = JBD2_ABORT; 971 journal->j_flags = JBD2_ABORT;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 13dcbc990f4..48c21bac5a5 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1255,8 +1255,10 @@ int jbd2_journal_stop(handle_t *handle)
1255 trans_time = ktime_to_ns(ktime_sub(ktime_get(), 1255 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1256 transaction->t_start_time)); 1256 transaction->t_start_time));
1257 1257
1258 commit_time = max_t(u64, commit_time,
1259 1000*journal->j_min_batch_time);
1258 commit_time = min_t(u64, commit_time, 1260 commit_time = min_t(u64, commit_time,
1259 1000*jiffies_to_usecs(1)); 1261 1000*journal->j_max_batch_time);
1260 1262
1261 if (trans_time < commit_time) { 1263 if (trans_time < commit_time) {
1262 ktime_t expires = ktime_add_ns(ktime_get(), 1264 ktime_t expires = ktime_add_ns(ktime_get(),
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index ab8cef130c2..a3cd647ea1b 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -956,6 +956,14 @@ struct journal_s
956 */ 956 */
957 u64 j_average_commit_time; 957 u64 j_average_commit_time;
958 958
959 /*
960 * minimum and maximum times that we should wait for
961 * additional filesystem operations to get batched into a
962 * synchronous handle in microseconds
963 */
964 u32 j_min_batch_time;
965 u32 j_max_batch_time;
966
959 /* This function is called when a transaction is closed */ 967 /* This function is called when a transaction is closed */
960 void (*j_commit_callback)(journal_t *, 968 void (*j_commit_callback)(journal_t *,
961 transaction_t *); 969 transaction_t *);