aboutsummaryrefslogtreecommitdiffstats
path: root/fs/jbd2
diff options
context:
space:
mode:
Diffstat (limited to 'fs/jbd2')
-rw-r--r--fs/jbd2/checkpoint.c24
-rw-r--r--fs/jbd2/commit.c67
-rw-r--r--fs/jbd2/journal.c143
-rw-r--r--fs/jbd2/transaction.c107
4 files changed, 217 insertions, 124 deletions
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9497718fe920..17159cacbd9e 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -249,16 +249,14 @@ restart:
249 return ret; 249 return ret;
250} 250}
251 251
252#define NR_BATCH 64
253
254static void 252static void
255__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) 253__flush_batch(journal_t *journal, int *batch_count)
256{ 254{
257 int i; 255 int i;
258 256
259 ll_rw_block(SWRITE, *batch_count, bhs); 257 ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
260 for (i = 0; i < *batch_count; i++) { 258 for (i = 0; i < *batch_count; i++) {
261 struct buffer_head *bh = bhs[i]; 259 struct buffer_head *bh = journal->j_chkpt_bhs[i];
262 clear_buffer_jwrite(bh); 260 clear_buffer_jwrite(bh);
263 BUFFER_TRACE(bh, "brelse"); 261 BUFFER_TRACE(bh, "brelse");
264 __brelse(bh); 262 __brelse(bh);
@@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
277 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 275 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
278 */ 276 */
279static int __process_buffer(journal_t *journal, struct journal_head *jh, 277static int __process_buffer(journal_t *journal, struct journal_head *jh,
280 struct buffer_head **bhs, int *batch_count, 278 int *batch_count, transaction_t *transaction)
281 transaction_t *transaction)
282{ 279{
283 struct buffer_head *bh = jh2bh(jh); 280 struct buffer_head *bh = jh2bh(jh);
284 int ret = 0; 281 int ret = 0;
@@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
325 get_bh(bh); 322 get_bh(bh);
326 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 323 J_ASSERT_BH(bh, !buffer_jwrite(bh));
327 set_buffer_jwrite(bh); 324 set_buffer_jwrite(bh);
328 bhs[*batch_count] = bh; 325 journal->j_chkpt_bhs[*batch_count] = bh;
329 __buffer_relink_io(jh); 326 __buffer_relink_io(jh);
330 jbd_unlock_bh_state(bh); 327 jbd_unlock_bh_state(bh);
331 transaction->t_chp_stats.cs_written++; 328 transaction->t_chp_stats.cs_written++;
332 (*batch_count)++; 329 (*batch_count)++;
333 if (*batch_count == NR_BATCH) { 330 if (*batch_count == JBD2_NR_BATCH) {
334 spin_unlock(&journal->j_list_lock); 331 spin_unlock(&journal->j_list_lock);
335 __flush_batch(journal, bhs, batch_count); 332 __flush_batch(journal, batch_count);
336 ret = 1; 333 ret = 1;
337 } 334 }
338 } 335 }
@@ -388,7 +385,6 @@ restart:
388 if (journal->j_checkpoint_transactions == transaction && 385 if (journal->j_checkpoint_transactions == transaction &&
389 transaction->t_tid == this_tid) { 386 transaction->t_tid == this_tid) {
390 int batch_count = 0; 387 int batch_count = 0;
391 struct buffer_head *bhs[NR_BATCH];
392 struct journal_head *jh; 388 struct journal_head *jh;
393 int retry = 0, err; 389 int retry = 0, err;
394 390
@@ -402,7 +398,7 @@ restart:
402 retry = 1; 398 retry = 1;
403 break; 399 break;
404 } 400 }
405 retry = __process_buffer(journal, jh, bhs, &batch_count, 401 retry = __process_buffer(journal, jh, &batch_count,
406 transaction); 402 transaction);
407 if (retry < 0 && !result) 403 if (retry < 0 && !result)
408 result = retry; 404 result = retry;
@@ -419,7 +415,7 @@ restart:
419 spin_unlock(&journal->j_list_lock); 415 spin_unlock(&journal->j_list_lock);
420 retry = 1; 416 retry = 1;
421 } 417 }
422 __flush_batch(journal, bhs, &batch_count); 418 __flush_batch(journal, &batch_count);
423 } 419 }
424 420
425 if (retry) { 421 if (retry) {
@@ -686,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
686 safely remove this transaction from the log */ 682 safely remove this transaction from the log */
687 683
688 __jbd2_journal_drop_transaction(journal, transaction); 684 __jbd2_journal_drop_transaction(journal, transaction);
685 kfree(transaction);
689 686
690 /* Just in case anybody was waiting for more transactions to be 687 /* Just in case anybody was waiting for more transactions to be
691 checkpointed... */ 688 checkpointed... */
@@ -760,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
760 J_ASSERT(journal->j_running_transaction != transaction); 757 J_ASSERT(journal->j_running_transaction != transaction);
761 758
762 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 759 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
763 kfree(transaction);
764} 760}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index ebc667bc54a8..62804e57a44c 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
25#include <linux/crc32.h> 25#include <linux/crc32.h>
26#include <linux/writeback.h> 26#include <linux/writeback.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/bio.h>
28 29
29/* 30/*
30 * Default IO end handler for temporary BJ_IO buffer_heads. 31 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -137,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
137 set_buffer_ordered(bh); 138 set_buffer_ordered(bh);
138 barrier_done = 1; 139 barrier_done = 1;
139 } 140 }
140 ret = submit_bh(WRITE, bh); 141 ret = submit_bh(WRITE_SYNC, bh);
141 if (barrier_done) 142 if (barrier_done)
142 clear_buffer_ordered(bh); 143 clear_buffer_ordered(bh);
143 144
@@ -158,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
158 lock_buffer(bh); 159 lock_buffer(bh);
159 set_buffer_uptodate(bh); 160 set_buffer_uptodate(bh);
160 clear_buffer_dirty(bh); 161 clear_buffer_dirty(bh);
161 ret = submit_bh(WRITE, bh); 162 ret = submit_bh(WRITE_SYNC, bh);
162 } 163 }
163 *cbh = bh; 164 *cbh = bh;
164 return ret; 165 return ret;
@@ -168,12 +169,34 @@ static int journal_submit_commit_record(journal_t *journal,
168 * This function along with journal_submit_commit_record 169 * This function along with journal_submit_commit_record
169 * allows to write the commit record asynchronously. 170 * allows to write the commit record asynchronously.
170 */ 171 */
171static int journal_wait_on_commit_record(struct buffer_head *bh) 172static int journal_wait_on_commit_record(journal_t *journal,
173 struct buffer_head *bh)
172{ 174{
173 int ret = 0; 175 int ret = 0;
174 176
177retry:
175 clear_buffer_dirty(bh); 178 clear_buffer_dirty(bh);
176 wait_on_buffer(bh); 179 wait_on_buffer(bh);
180 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
181 printk(KERN_WARNING
182 "JBD2: wait_on_commit_record: sync failed on %s - "
183 "disabling barriers\n", journal->j_devname);
184 spin_lock(&journal->j_state_lock);
185 journal->j_flags &= ~JBD2_BARRIER;
186 spin_unlock(&journal->j_state_lock);
187
188 lock_buffer(bh);
189 clear_buffer_dirty(bh);
190 set_buffer_uptodate(bh);
191 bh->b_end_io = journal_end_buffer_io_sync;
192
193 ret = submit_bh(WRITE_SYNC, bh);
194 if (ret) {
195 unlock_buffer(bh);
196 return ret;
197 }
198 goto retry;
199 }
177 200
178 if (unlikely(!buffer_uptodate(bh))) 201 if (unlikely(!buffer_uptodate(bh)))
179 ret = -EIO; 202 ret = -EIO;
@@ -332,13 +355,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
332 int flags; 355 int flags;
333 int err; 356 int err;
334 unsigned long long blocknr; 357 unsigned long long blocknr;
358 ktime_t start_time;
359 u64 commit_time;
335 char *tagp = NULL; 360 char *tagp = NULL;
336 journal_header_t *header; 361 journal_header_t *header;
337 journal_block_tag_t *tag = NULL; 362 journal_block_tag_t *tag = NULL;
338 int space_left = 0; 363 int space_left = 0;
339 int first_tag = 0; 364 int first_tag = 0;
340 int tag_flag; 365 int tag_flag;
341 int i; 366 int i, to_free = 0;
342 int tag_bytes = journal_tag_bytes(journal); 367 int tag_bytes = journal_tag_bytes(journal);
343 struct buffer_head *cbh = NULL; /* For transactional checksums */ 368 struct buffer_head *cbh = NULL; /* For transactional checksums */
344 __u32 crc32_sum = ~0; 369 __u32 crc32_sum = ~0;
@@ -458,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
458 commit_transaction->t_state = T_FLUSH; 483 commit_transaction->t_state = T_FLUSH;
459 journal->j_committing_transaction = commit_transaction; 484 journal->j_committing_transaction = commit_transaction;
460 journal->j_running_transaction = NULL; 485 journal->j_running_transaction = NULL;
486 start_time = ktime_get();
461 commit_transaction->t_log_start = journal->j_head; 487 commit_transaction->t_log_start = journal->j_head;
462 wake_up(&journal->j_wait_transaction_locked); 488 wake_up(&journal->j_wait_transaction_locked);
463 spin_unlock(&journal->j_state_lock); 489 spin_unlock(&journal->j_state_lock);
@@ -509,6 +535,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
509 if (is_journal_aborted(journal)) { 535 if (is_journal_aborted(journal)) {
510 clear_buffer_jbddirty(jh2bh(jh)); 536 clear_buffer_jbddirty(jh2bh(jh));
511 JBUFFER_TRACE(jh, "journal is aborting: refile"); 537 JBUFFER_TRACE(jh, "journal is aborting: refile");
538 jbd2_buffer_abort_trigger(jh,
539 jh->b_frozen_data ?
540 jh->b_frozen_triggers :
541 jh->b_triggers);
512 jbd2_journal_refile_buffer(journal, jh); 542 jbd2_journal_refile_buffer(journal, jh);
513 /* If that was the last one, we need to clean up 543 /* If that was the last one, we need to clean up
514 * any descriptor buffers which may have been 544 * any descriptor buffers which may have been
@@ -799,7 +829,7 @@ wait_for_iobuf:
799 __jbd2_journal_abort_hard(journal); 829 __jbd2_journal_abort_hard(journal);
800 } 830 }
801 if (!err && !is_journal_aborted(journal)) 831 if (!err && !is_journal_aborted(journal))
802 err = journal_wait_on_commit_record(cbh); 832 err = journal_wait_on_commit_record(journal, cbh);
803 833
804 if (err) 834 if (err)
805 jbd2_journal_abort(journal, err); 835 jbd2_journal_abort(journal, err);
@@ -844,6 +874,9 @@ restart_loop:
844 * data. 874 * data.
845 * 875 *
846 * Otherwise, we can just throw away the frozen data now. 876 * Otherwise, we can just throw away the frozen data now.
877 *
878 * We also know that the frozen data has already fired
879 * its triggers if they exist, so we can clear that too.
847 */ 880 */
848 if (jh->b_committed_data) { 881 if (jh->b_committed_data) {
849 jbd2_free(jh->b_committed_data, bh->b_size); 882 jbd2_free(jh->b_committed_data, bh->b_size);
@@ -851,10 +884,12 @@ restart_loop:
851 if (jh->b_frozen_data) { 884 if (jh->b_frozen_data) {
852 jh->b_committed_data = jh->b_frozen_data; 885 jh->b_committed_data = jh->b_frozen_data;
853 jh->b_frozen_data = NULL; 886 jh->b_frozen_data = NULL;
887 jh->b_frozen_triggers = NULL;
854 } 888 }
855 } else if (jh->b_frozen_data) { 889 } else if (jh->b_frozen_data) {
856 jbd2_free(jh->b_frozen_data, bh->b_size); 890 jbd2_free(jh->b_frozen_data, bh->b_size);
857 jh->b_frozen_data = NULL; 891 jh->b_frozen_data = NULL;
892 jh->b_frozen_triggers = NULL;
858 } 893 }
859 894
860 spin_lock(&journal->j_list_lock); 895 spin_lock(&journal->j_list_lock);
@@ -972,14 +1007,23 @@ restart_loop:
972 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1007 J_ASSERT(commit_transaction == journal->j_committing_transaction);
973 journal->j_commit_sequence = commit_transaction->t_tid; 1008 journal->j_commit_sequence = commit_transaction->t_tid;
974 journal->j_committing_transaction = NULL; 1009 journal->j_committing_transaction = NULL;
975 spin_unlock(&journal->j_state_lock); 1010 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
976 1011
977 if (journal->j_commit_callback) 1012 /*
978 journal->j_commit_callback(journal, commit_transaction); 1013 * weight the commit time higher than the average time so we don't
1014 * react too strongly to vast changes in the commit time
1015 */
1016 if (likely(journal->j_average_commit_time))
1017 journal->j_average_commit_time = (commit_time +
1018 journal->j_average_commit_time*3) / 4;
1019 else
1020 journal->j_average_commit_time = commit_time;
1021 spin_unlock(&journal->j_state_lock);
979 1022
980 if (commit_transaction->t_checkpoint_list == NULL && 1023 if (commit_transaction->t_checkpoint_list == NULL &&
981 commit_transaction->t_checkpoint_io_list == NULL) { 1024 commit_transaction->t_checkpoint_io_list == NULL) {
982 __jbd2_journal_drop_transaction(journal, commit_transaction); 1025 __jbd2_journal_drop_transaction(journal, commit_transaction);
1026 to_free = 1;
983 } else { 1027 } else {
984 if (journal->j_checkpoint_transactions == NULL) { 1028 if (journal->j_checkpoint_transactions == NULL) {
985 journal->j_checkpoint_transactions = commit_transaction; 1029 journal->j_checkpoint_transactions = commit_transaction;
@@ -998,11 +1042,16 @@ restart_loop:
998 } 1042 }
999 spin_unlock(&journal->j_list_lock); 1043 spin_unlock(&journal->j_list_lock);
1000 1044
1045 if (journal->j_commit_callback)
1046 journal->j_commit_callback(journal, commit_transaction);
1047
1001 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", 1048 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
1002 journal->j_devname, journal->j_commit_sequence, 1049 journal->j_devname, commit_transaction->t_tid,
1003 journal->j_tail_sequence); 1050 journal->j_tail_sequence);
1004 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1051 jbd_debug(1, "JBD: commit %d complete, head %d\n",
1005 journal->j_commit_sequence, journal->j_tail_sequence); 1052 journal->j_commit_sequence, journal->j_tail_sequence);
1053 if (to_free)
1054 kfree(commit_transaction);
1006 1055
1007 wake_up(&journal->j_wait_done_commit); 1056 wake_up(&journal->j_wait_done_commit);
1008} 1057}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e70d657a19f8..56675306ed81 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -40,6 +40,7 @@
40 40
41#include <asm/uaccess.h> 41#include <asm/uaccess.h>
42#include <asm/page.h> 42#include <asm/page.h>
43#include <asm/div64.h>
43 44
44EXPORT_SYMBOL(jbd2_journal_start); 45EXPORT_SYMBOL(jbd2_journal_start);
45EXPORT_SYMBOL(jbd2_journal_restart); 46EXPORT_SYMBOL(jbd2_journal_restart);
@@ -50,6 +51,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
50EXPORT_SYMBOL(jbd2_journal_get_write_access); 51EXPORT_SYMBOL(jbd2_journal_get_write_access);
51EXPORT_SYMBOL(jbd2_journal_get_create_access); 52EXPORT_SYMBOL(jbd2_journal_get_create_access);
52EXPORT_SYMBOL(jbd2_journal_get_undo_access); 53EXPORT_SYMBOL(jbd2_journal_get_undo_access);
54EXPORT_SYMBOL(jbd2_journal_set_triggers);
53EXPORT_SYMBOL(jbd2_journal_dirty_metadata); 55EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
54EXPORT_SYMBOL(jbd2_journal_release_buffer); 56EXPORT_SYMBOL(jbd2_journal_release_buffer);
55EXPORT_SYMBOL(jbd2_journal_forget); 57EXPORT_SYMBOL(jbd2_journal_forget);
@@ -65,7 +67,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format);
65EXPORT_SYMBOL(jbd2_journal_check_used_features); 67EXPORT_SYMBOL(jbd2_journal_check_used_features);
66EXPORT_SYMBOL(jbd2_journal_check_available_features); 68EXPORT_SYMBOL(jbd2_journal_check_available_features);
67EXPORT_SYMBOL(jbd2_journal_set_features); 69EXPORT_SYMBOL(jbd2_journal_set_features);
68EXPORT_SYMBOL(jbd2_journal_create);
69EXPORT_SYMBOL(jbd2_journal_load); 70EXPORT_SYMBOL(jbd2_journal_load);
70EXPORT_SYMBOL(jbd2_journal_destroy); 71EXPORT_SYMBOL(jbd2_journal_destroy);
71EXPORT_SYMBOL(jbd2_journal_abort); 72EXPORT_SYMBOL(jbd2_journal_abort);
@@ -131,8 +132,9 @@ static int kjournald2(void *arg)
131 journal->j_task = current; 132 journal->j_task = current;
132 wake_up(&journal->j_wait_done_commit); 133 wake_up(&journal->j_wait_done_commit);
133 134
134 printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n", 135 printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
135 journal->j_commit_interval / HZ); 136 "commit interval %ld seconds\n", current->pid,
137 journal->j_devname, journal->j_commit_interval / HZ);
136 138
137 /* 139 /*
138 * And now, wait forever for commit wakeup events. 140 * And now, wait forever for commit wakeup events.
@@ -290,6 +292,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
290 struct page *new_page; 292 struct page *new_page;
291 unsigned int new_offset; 293 unsigned int new_offset;
292 struct buffer_head *bh_in = jh2bh(jh_in); 294 struct buffer_head *bh_in = jh2bh(jh_in);
295 struct jbd2_buffer_trigger_type *triggers;
293 296
294 /* 297 /*
295 * The buffer really shouldn't be locked: only the current committing 298 * The buffer really shouldn't be locked: only the current committing
@@ -314,13 +317,23 @@ repeat:
314 done_copy_out = 1; 317 done_copy_out = 1;
315 new_page = virt_to_page(jh_in->b_frozen_data); 318 new_page = virt_to_page(jh_in->b_frozen_data);
316 new_offset = offset_in_page(jh_in->b_frozen_data); 319 new_offset = offset_in_page(jh_in->b_frozen_data);
320 triggers = jh_in->b_frozen_triggers;
317 } else { 321 } else {
318 new_page = jh2bh(jh_in)->b_page; 322 new_page = jh2bh(jh_in)->b_page;
319 new_offset = offset_in_page(jh2bh(jh_in)->b_data); 323 new_offset = offset_in_page(jh2bh(jh_in)->b_data);
324 triggers = jh_in->b_triggers;
320 } 325 }
321 326
322 mapped_data = kmap_atomic(new_page, KM_USER0); 327 mapped_data = kmap_atomic(new_page, KM_USER0);
323 /* 328 /*
329 * Fire any commit trigger. Do this before checking for escaping,
330 * as the trigger may modify the magic offset. If a copy-out
331 * happens afterwards, it will have the correct data in the buffer.
332 */
333 jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
334 triggers);
335
336 /*
324 * Check for escaping 337 * Check for escaping
325 */ 338 */
326 if (*((__be32 *)(mapped_data + new_offset)) == 339 if (*((__be32 *)(mapped_data + new_offset)) ==
@@ -352,6 +365,13 @@ repeat:
352 new_page = virt_to_page(tmp); 365 new_page = virt_to_page(tmp);
353 new_offset = offset_in_page(tmp); 366 new_offset = offset_in_page(tmp);
354 done_copy_out = 1; 367 done_copy_out = 1;
368
369 /*
370 * This isn't strictly necessary, as we're using frozen
371 * data for the escaping, but it keeps consistency with
372 * b_frozen_data usage.
373 */
374 jh_in->b_frozen_triggers = jh_in->b_triggers;
355 } 375 }
356 376
357 /* 377 /*
@@ -631,6 +651,8 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
631 return NULL; 651 return NULL;
632 652
633 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 653 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
654 if (!bh)
655 return NULL;
634 lock_buffer(bh); 656 lock_buffer(bh);
635 memset(bh->b_data, 0, journal->j_blocksize); 657 memset(bh->b_data, 0, journal->j_blocksize);
636 set_buffer_uptodate(bh); 658 set_buffer_uptodate(bh);
@@ -824,6 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
824 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); 846 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
825 seq_printf(seq, " %ums logging transaction\n", 847 seq_printf(seq, " %ums logging transaction\n",
826 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); 848 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
849 seq_printf(seq, " %luus average transaction commit time\n",
850 do_div(s->journal->j_average_commit_time, 1000));
827 seq_printf(seq, " %lu handles per transaction\n", 851 seq_printf(seq, " %lu handles per transaction\n",
828 s->stats->u.run.rs_handle_count / s->stats->ts_tid); 852 s->stats->u.run.rs_handle_count / s->stats->ts_tid);
829 seq_printf(seq, " %lu blocks per transaction\n", 853 seq_printf(seq, " %lu blocks per transaction\n",
@@ -961,6 +985,8 @@ static journal_t * journal_init_common (void)
961 spin_lock_init(&journal->j_state_lock); 985 spin_lock_init(&journal->j_state_lock);
962 986
963 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); 987 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
988 journal->j_min_batch_time = 0;
989 journal->j_max_batch_time = 15000; /* 15ms */
964 990
965 /* The journal is marked for error until we succeed with recovery! */ 991 /* The journal is marked for error until we succeed with recovery! */
966 journal->j_flags = JBD2_ABORT; 992 journal->j_flags = JBD2_ABORT;
@@ -1016,15 +1042,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1016 1042
1017 /* journal descriptor can store up to n blocks -bzzz */ 1043 /* journal descriptor can store up to n blocks -bzzz */
1018 journal->j_blocksize = blocksize; 1044 journal->j_blocksize = blocksize;
1045 jbd2_stats_proc_init(journal);
1019 n = journal->j_blocksize / sizeof(journal_block_tag_t); 1046 n = journal->j_blocksize / sizeof(journal_block_tag_t);
1020 journal->j_wbufsize = n; 1047 journal->j_wbufsize = n;
1021 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 1048 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
1022 if (!journal->j_wbuf) { 1049 if (!journal->j_wbuf) {
1023 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 1050 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
1024 __func__); 1051 __func__);
1025 kfree(journal); 1052 goto out_err;
1026 journal = NULL;
1027 goto out;
1028 } 1053 }
1029 journal->j_dev = bdev; 1054 journal->j_dev = bdev;
1030 journal->j_fs_dev = fs_dev; 1055 journal->j_fs_dev = fs_dev;
@@ -1034,14 +1059,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1034 p = journal->j_devname; 1059 p = journal->j_devname;
1035 while ((p = strchr(p, '/'))) 1060 while ((p = strchr(p, '/')))
1036 *p = '!'; 1061 *p = '!';
1037 jbd2_stats_proc_init(journal);
1038 1062
1039 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 1063 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
1040 J_ASSERT(bh != NULL); 1064 if (!bh) {
1065 printk(KERN_ERR
1066 "%s: Cannot get buffer for journal superblock\n",
1067 __func__);
1068 goto out_err;
1069 }
1041 journal->j_sb_buffer = bh; 1070 journal->j_sb_buffer = bh;
1042 journal->j_superblock = (journal_superblock_t *)bh->b_data; 1071 journal->j_superblock = (journal_superblock_t *)bh->b_data;
1043out: 1072
1044 return journal; 1073 return journal;
1074out_err:
1075 jbd2_stats_proc_exit(journal);
1076 kfree(journal);
1077 return NULL;
1045} 1078}
1046 1079
1047/** 1080/**
@@ -1089,9 +1122,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1089 if (!journal->j_wbuf) { 1122 if (!journal->j_wbuf) {
1090 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 1123 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
1091 __func__); 1124 __func__);
1092 jbd2_stats_proc_exit(journal); 1125 goto out_err;
1093 kfree(journal);
1094 return NULL;
1095 } 1126 }
1096 1127
1097 err = jbd2_journal_bmap(journal, 0, &blocknr); 1128 err = jbd2_journal_bmap(journal, 0, &blocknr);
@@ -1099,17 +1130,24 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1099 if (err) { 1130 if (err) {
1100 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 1131 printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
1101 __func__); 1132 __func__);
1102 jbd2_stats_proc_exit(journal); 1133 goto out_err;
1103 kfree(journal);
1104 return NULL;
1105 } 1134 }
1106 1135
1107 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 1136 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
1108 J_ASSERT(bh != NULL); 1137 if (!bh) {
1138 printk(KERN_ERR
1139 "%s: Cannot get buffer for journal superblock\n",
1140 __func__);
1141 goto out_err;
1142 }
1109 journal->j_sb_buffer = bh; 1143 journal->j_sb_buffer = bh;
1110 journal->j_superblock = (journal_superblock_t *)bh->b_data; 1144 journal->j_superblock = (journal_superblock_t *)bh->b_data;
1111 1145
1112 return journal; 1146 return journal;
1147out_err:
1148 jbd2_stats_proc_exit(journal);
1149 kfree(journal);
1150 return NULL;
1113} 1151}
1114 1152
1115/* 1153/*
@@ -1158,77 +1196,6 @@ static int journal_reset(journal_t *journal)
1158} 1196}
1159 1197
1160/** 1198/**
1161 * int jbd2_journal_create() - Initialise the new journal file
1162 * @journal: Journal to create. This structure must have been initialised
1163 *
1164 * Given a journal_t structure which tells us which disk blocks we can
1165 * use, create a new journal superblock and initialise all of the
1166 * journal fields from scratch.
1167 **/
1168int jbd2_journal_create(journal_t *journal)
1169{
1170 unsigned long long blocknr;
1171 struct buffer_head *bh;
1172 journal_superblock_t *sb;
1173 int i, err;
1174
1175 if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
1176 printk (KERN_ERR "Journal length (%d blocks) too short.\n",
1177 journal->j_maxlen);
1178 journal_fail_superblock(journal);
1179 return -EINVAL;
1180 }
1181
1182 if (journal->j_inode == NULL) {
1183 /*
1184 * We don't know what block to start at!
1185 */
1186 printk(KERN_EMERG
1187 "%s: creation of journal on external device!\n",
1188 __func__);
1189 BUG();
1190 }
1191
1192 /* Zero out the entire journal on disk. We cannot afford to
1193 have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
1194 jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
1195 for (i = 0; i < journal->j_maxlen; i++) {
1196 err = jbd2_journal_bmap(journal, i, &blocknr);
1197 if (err)
1198 return err;
1199 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
1200 lock_buffer(bh);
1201 memset (bh->b_data, 0, journal->j_blocksize);
1202 BUFFER_TRACE(bh, "marking dirty");
1203 mark_buffer_dirty(bh);
1204 BUFFER_TRACE(bh, "marking uptodate");
1205 set_buffer_uptodate(bh);
1206 unlock_buffer(bh);
1207 __brelse(bh);
1208 }
1209
1210 sync_blockdev(journal->j_dev);
1211 jbd_debug(1, "JBD: journal cleared.\n");
1212
1213 /* OK, fill in the initial static fields in the new superblock */
1214 sb = journal->j_superblock;
1215
1216 sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
1217 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
1218
1219 sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
1220 sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
1221 sb->s_first = cpu_to_be32(1);
1222
1223 journal->j_transaction_sequence = 1;
1224
1225 journal->j_flags &= ~JBD2_ABORT;
1226 journal->j_format_version = 2;
1227
1228 return journal_reset(journal);
1229}
1230
1231/**
1232 * void jbd2_journal_update_superblock() - Update journal sb on disk. 1199 * void jbd2_journal_update_superblock() - Update journal sb on disk.
1233 * @journal: The journal to update. 1200 * @journal: The journal to update.
1234 * @wait: Set to '0' if you don't want to wait for IO completion. 1201 * @wait: Set to '0' if you don't want to wait for IO completion.
@@ -1472,7 +1439,9 @@ int jbd2_journal_destroy(journal_t *journal)
1472 spin_lock(&journal->j_list_lock); 1439 spin_lock(&journal->j_list_lock);
1473 while (journal->j_checkpoint_transactions != NULL) { 1440 while (journal->j_checkpoint_transactions != NULL) {
1474 spin_unlock(&journal->j_list_lock); 1441 spin_unlock(&journal->j_list_lock);
1442 mutex_lock(&journal->j_checkpoint_mutex);
1475 jbd2_log_do_checkpoint(journal); 1443 jbd2_log_do_checkpoint(journal);
1444 mutex_unlock(&journal->j_checkpoint_mutex);
1476 spin_lock(&journal->j_list_lock); 1445 spin_lock(&journal->j_list_lock);
1477 } 1446 }
1478 1447
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 39b7805a599a..46b4e347ed7d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -25,6 +25,7 @@
25#include <linux/timer.h> 25#include <linux/timer.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h>
28 29
29static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 30static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
30 31
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
48{ 49{
49 transaction->t_journal = journal; 50 transaction->t_journal = journal;
50 transaction->t_state = T_RUNNING; 51 transaction->t_state = T_RUNNING;
52 transaction->t_start_time = ktime_get();
51 transaction->t_tid = journal->j_transaction_sequence++; 53 transaction->t_tid = journal->j_transaction_sequence++;
52 transaction->t_expires = jiffies + journal->j_commit_interval; 54 transaction->t_expires = jiffies + journal->j_commit_interval;
53 spin_lock_init(&transaction->t_handle_lock); 55 spin_lock_init(&transaction->t_handle_lock);
@@ -741,6 +743,12 @@ done:
741 source = kmap_atomic(page, KM_USER0); 743 source = kmap_atomic(page, KM_USER0);
742 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); 744 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
743 kunmap_atomic(source, KM_USER0); 745 kunmap_atomic(source, KM_USER0);
746
747 /*
748 * Now that the frozen data is saved off, we need to store
749 * any matching triggers.
750 */
751 jh->b_frozen_triggers = jh->b_triggers;
744 } 752 }
745 jbd_unlock_bh_state(bh); 753 jbd_unlock_bh_state(bh);
746 754
@@ -944,6 +952,47 @@ out:
944} 952}
945 953
946/** 954/**
955 * void jbd2_journal_set_triggers() - Add triggers for commit writeout
956 * @bh: buffer to trigger on
957 * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
958 *
959 * Set any triggers on this journal_head. This is always safe, because
960 * triggers for a committing buffer will be saved off, and triggers for
961 * a running transaction will match the buffer in that transaction.
962 *
963 * Call with NULL to clear the triggers.
964 */
965void jbd2_journal_set_triggers(struct buffer_head *bh,
966 struct jbd2_buffer_trigger_type *type)
967{
968 struct journal_head *jh = bh2jh(bh);
969
970 jh->b_triggers = type;
971}
972
973void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
974 struct jbd2_buffer_trigger_type *triggers)
975{
976 struct buffer_head *bh = jh2bh(jh);
977
978 if (!triggers || !triggers->t_commit)
979 return;
980
981 triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
982}
983
984void jbd2_buffer_abort_trigger(struct journal_head *jh,
985 struct jbd2_buffer_trigger_type *triggers)
986{
987 if (!triggers || !triggers->t_abort)
988 return;
989
990 triggers->t_abort(triggers, jh2bh(jh));
991}
992
993
994
995/**
947 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata 996 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
948 * @handle: transaction to add buffer to. 997 * @handle: transaction to add buffer to.
949 * @bh: buffer to mark 998 * @bh: buffer to mark
@@ -1193,7 +1242,7 @@ int jbd2_journal_stop(handle_t *handle)
1193{ 1242{
1194 transaction_t *transaction = handle->h_transaction; 1243 transaction_t *transaction = handle->h_transaction;
1195 journal_t *journal = transaction->t_journal; 1244 journal_t *journal = transaction->t_journal;
1196 int old_handle_count, err; 1245 int err;
1197 pid_t pid; 1246 pid_t pid;
1198 1247
1199 J_ASSERT(journal_current_handle() == handle); 1248 J_ASSERT(journal_current_handle() == handle);
@@ -1216,24 +1265,54 @@ int jbd2_journal_stop(handle_t *handle)
1216 /* 1265 /*
1217 * Implement synchronous transaction batching. If the handle 1266 * Implement synchronous transaction batching. If the handle
1218 * was synchronous, don't force a commit immediately. Let's 1267 * was synchronous, don't force a commit immediately. Let's
1219 * yield and let another thread piggyback onto this transaction. 1268 * yield and let another thread piggyback onto this
1220 * Keep doing that while new threads continue to arrive. 1269 * transaction. Keep doing that while new threads continue to
1221 * It doesn't cost much - we're about to run a commit and sleep 1270 * arrive. It doesn't cost much - we're about to run a commit
1222 * on IO anyway. Speeds up many-threaded, many-dir operations 1271 * and sleep on IO anyway. Speeds up many-threaded, many-dir
1223 * by 30x or more... 1272 * operations by 30x or more...
1224 * 1273 *
1225 * But don't do this if this process was the most recent one to 1274 * We try and optimize the sleep time against what the
1226 * perform a synchronous write. We do this to detect the case where a 1275 * underlying disk can do, instead of having a static sleep
1227 * single process is doing a stream of sync writes. No point in waiting 1276 * time. This is useful for the case where our storage is so
1228 * for joiners in that case. 1277 * fast that it is more optimal to go ahead and force a flush
1278 * and wait for the transaction to be committed than it is to
1279 * wait for an arbitrary amount of time for new writers to
1280 * join the transaction. We achieve this by measuring how
1281 * long it takes to commit a transaction, and compare it with
1282 * how long this transaction has been running, and if run time
1283 * < commit time then we sleep for the delta and commit. This
1284 * greatly helps super fast disks that would see slowdowns as
1285 * more threads started doing fsyncs.
1286 *
1287 * But don't do this if this process was the most recent one
1288 * to perform a synchronous write. We do this to detect the
1289 * case where a single process is doing a stream of sync
1290 * writes. No point in waiting for joiners in that case.
1229 */ 1291 */
1230 pid = current->pid; 1292 pid = current->pid;
1231 if (handle->h_sync && journal->j_last_sync_writer != pid) { 1293 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1294 u64 commit_time, trans_time;
1295
1232 journal->j_last_sync_writer = pid; 1296 journal->j_last_sync_writer = pid;
1233 do { 1297
1234 old_handle_count = transaction->t_handle_count; 1298 spin_lock(&journal->j_state_lock);
1235 schedule_timeout_uninterruptible(1); 1299 commit_time = journal->j_average_commit_time;
1236 } while (old_handle_count != transaction->t_handle_count); 1300 spin_unlock(&journal->j_state_lock);
1301
1302 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1303 transaction->t_start_time));
1304
1305 commit_time = max_t(u64, commit_time,
1306 1000*journal->j_min_batch_time);
1307 commit_time = min_t(u64, commit_time,
1308 1000*journal->j_max_batch_time);
1309
1310 if (trans_time < commit_time) {
1311 ktime_t expires = ktime_add_ns(ktime_get(),
1312 commit_time);
1313 set_current_state(TASK_UNINTERRUPTIBLE);
1314 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1315 }
1237 } 1316 }
1238 1317
1239 current->journal_info = NULL; 1318 current->journal_info = NULL;