author    Dave Chinner <dchinner@redhat.com>  2010-08-23 21:40:03 -0400
committer Dave Chinner <david@fromorbit.com>  2010-08-23 21:40:03 -0400
commit    a44f13edf0ebb4e41942d0f16ca80489dcf6659d
tree      42bcbee56a62851e969292033efd600cced80ca5
parent    1a387d3be2b30c90f20d49a3497a8fc0693a9d18
xfs: Reduce log force overhead for delayed logging
Delayed logging adds some serialisation to the log force process to ensure that it does not dereference a bad commit context structure when determining if a CIL push is necessary or not. It does this by grabbing the CIL context lock exclusively, then dropping it before pushing the CIL if necessary. This serialises all log forces and pushes regardless of whether a force is necessary or not. As a result, fsync-heavy workloads (like dbench) can be significantly slower with delayed logging than without.

To avoid this penalty, copy the current sequence from the context to the CIL structure when they are swapped. This allows us to do unlocked checks on the current sequence without having to worry about dereferencing context structures that may have already been freed. Hence we can remove the CIL context locking in the forcing code and only call into the push code if the current context matches the sequence we need to force.

By passing the sequence into the push code, we can check the sequence again once we have the CIL lock held exclusively and abort if the sequence has already been pushed. This avoids a lock round-trip and unnecessary CIL pushes when we have racing push calls.

The result is that the regression in dbench performance goes away - this change improves dbench performance on a ramdisk from ~2100MB/s to ~2500MB/s. This compares favourably to not using delayed logging, which returns ~2500MB/s for the same workload.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
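The core of the patch is a double-checked sequence protocol: an unlocked read of a mirrored sequence number decides whether a push is needed at all, and the push code re-checks under the exclusive context lock so that racing forcers of an already-pushed sequence simply abort. The following is a minimal user-space sketch of that protocol only, with pthreads standing in for the kernel primitives; all names and types in it are hypothetical, not the XFS implementation.

/*
 * Sketch of the double-checked sequence protocol (hypothetical names).
 * cil_force_seq() does the unlocked check against the mirrored
 * sequence; cil_push() re-checks under the exclusive lock and aborts
 * if a racing caller already pushed that sequence.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct cil {
        pthread_rwlock_t ctx_lock;      /* stands in for xc_ctx_lock */
        _Atomic uint64_t current_seq;   /* mirror of the context sequence */
        uint64_t         ctx_seq;       /* context sequence, under ctx_lock */
};

static int cil_push(struct cil *cil, uint64_t push_seq)
{
        pthread_rwlock_wrlock(&cil->ctx_lock);

        /* a racing forcer may have pushed this sequence already */
        if (push_seq < cil->ctx_seq) {
                pthread_rwlock_unlock(&cil->ctx_lock);
                return 0;               /* already pushed, abort */
        }

        /* ... the checkpoint would be formatted and written here ... */
        cil->ctx_seq++;                 /* swap to the next context */
        atomic_store(&cil->current_seq, cil->ctx_seq);

        pthread_rwlock_unlock(&cil->ctx_lock);
        return 1;
}

static void cil_force_seq(struct cil *cil, uint64_t seq)
{
        /* unlocked check: no context dereference, no serialisation */
        if (seq == atomic_load(&cil->current_seq))
                cil_push(cil, seq);
        /* seq below current_seq means it was already pushed */
}

int main(void)
{
        struct cil cil = { .ctx_seq = 1 };

        pthread_rwlock_init(&cil.ctx_lock, NULL);
        atomic_store(&cil.current_seq, cil.ctx_seq);

        cil_force_seq(&cil, 1);         /* pushes sequence 1 */
        cil_force_seq(&cil, 1);         /* no-op: already pushed */
        printf("current sequence %llu\n",
               (unsigned long long)atomic_load(&cil.current_seq));

        pthread_rwlock_destroy(&cil.ctx_lock);
        return 0;
}

The mirrored counter is what makes the unlocked fast path safe: readers only compare an integer that outlives any individual context, so no freed context structure is ever dereferenced.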
Diffstat (limited to 'fs')
-rw-r--r--  fs/xfs/xfs_log.c        7
-rw-r--r--  fs/xfs/xfs_log_cil.c    245
-rw-r--r--  fs/xfs/xfs_log_priv.h   13
3 files changed, 147 insertions(+), 118 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 925d572bf0f4..33f718f92a48 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3015,7 +3015,8 @@ _xfs_log_force(
 
        XFS_STATS_INC(xs_log_force);
 
-       xlog_cil_push(log, 1);
+       if (log->l_cilp)
+               xlog_cil_force(log);
 
        spin_lock(&log->l_icloglock);
 
@@ -3167,7 +3168,7 @@ _xfs_log_force_lsn(
        XFS_STATS_INC(xs_log_force);
 
        if (log->l_cilp) {
-               lsn = xlog_cil_push_lsn(log, lsn);
+               lsn = xlog_cil_force_lsn(log, lsn);
                if (lsn == NULLCOMMITLSN)
                        return 0;
        }
@@ -3724,7 +3725,7 @@ xfs_log_force_umount(
         * call below.
         */
        if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
-               xlog_cil_push(log, 1);
+               xlog_cil_force(log);
 
        /*
        * We must hold both the GRANT lock and the LOG lock,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ef8e7d9f445d..9768f2437bb3 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -68,6 +68,7 @@ xlog_cil_init(
        ctx->sequence = 1;
        ctx->cil = cil;
        cil->xc_ctx = ctx;
+       cil->xc_current_sequence = ctx->sequence;
 
        cil->xc_log = log;
        log->l_cilp = cil;
@@ -321,94 +322,6 @@ xlog_cil_free_logvec(
 }
 
 /*
- * Commit a transaction with the given vector to the Committed Item List.
- *
- * To do this, we need to format the item, pin it in memory if required and
- * account for the space used by the transaction. Once we have done that we
- * need to release the unused reservation for the transaction, attach the
- * transaction to the checkpoint context so we carry the busy extents through
- * to checkpoint completion, and then unlock all the items in the transaction.
- *
- * For more specific information about the order of operations in
- * xfs_log_commit_cil() please refer to the comments in
- * xfs_trans_commit_iclog().
- *
- * Called with the context lock already held in read mode to lock out
- * background commit, returns without it held once background commits are
- * allowed again.
- */
-int
-xfs_log_commit_cil(
-       struct xfs_mount        *mp,
-       struct xfs_trans        *tp,
-       struct xfs_log_vec      *log_vector,
-       xfs_lsn_t               *commit_lsn,
-       int                     flags)
-{
-       struct log              *log = mp->m_log;
-       int                     log_flags = 0;
-       int                     push = 0;
-
-       if (flags & XFS_TRANS_RELEASE_LOG_RES)
-               log_flags = XFS_LOG_REL_PERM_RESERV;
-
-       if (XLOG_FORCED_SHUTDOWN(log)) {
-               xlog_cil_free_logvec(log_vector);
-               return XFS_ERROR(EIO);
-       }
-
-       /* lock out background commit */
-       down_read(&log->l_cilp->xc_ctx_lock);
-       xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
-
-       /* check we didn't blow the reservation */
-       if (tp->t_ticket->t_curr_res < 0)
-               xlog_print_tic_res(log->l_mp, tp->t_ticket);
-
-       /* attach the transaction to the CIL if it has any busy extents */
-       if (!list_empty(&tp->t_busy)) {
-               spin_lock(&log->l_cilp->xc_cil_lock);
-               list_splice_init(&tp->t_busy,
-                                       &log->l_cilp->xc_ctx->busy_extents);
-               spin_unlock(&log->l_cilp->xc_cil_lock);
-       }
-
-       tp->t_commit_lsn = *commit_lsn;
-       xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
-       xfs_trans_unreserve_and_mod_sb(tp);
-
-       /*
-        * Once all the items of the transaction have been copied to the CIL,
-        * the items can be unlocked and freed.
-        *
-        * This needs to be done before we drop the CIL context lock because we
-        * have to update state in the log items and unlock them before they go
-        * to disk. If we don't, then the CIL checkpoint can race with us and
-        * we can run checkpoint completion before we've updated and unlocked
-        * the log items. This affects (at least) processing of stale buffers,
-        * inodes and EFIs.
-        */
-       xfs_trans_free_items(tp, *commit_lsn, 0);
-
-       /* check for background commit before unlock */
-       if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
-               push = 1;
-
-       up_read(&log->l_cilp->xc_ctx_lock);
-
-       /*
-        * We need to push CIL every so often so we don't cache more than we
-        * can fit in the log. The limit really is that a checkpoint can't be
-        * more than half the log (the current checkpoint is not allowed to
-        * overwrite the previous checkpoint), but commit latency and memory
-        * usage limit this to a smaller size in most cases.
-        */
-       if (push)
-               xlog_cil_push(log, 0);
-       return 0;
-}
-
-/*
  * Mark all items committed and clear busy extents. We free the log vector
  * chains in a separate pass so that we unpin the log items as quickly as
  * possible.
@@ -441,13 +354,23 @@ xlog_cil_committed(
 }
 
 /*
- * Push the Committed Item List to the log. If the push_now flag is not set,
- * then it is a background flush and so we can chose to ignore it.
+ * Push the Committed Item List to the log. If @push_seq is zero, then it
+ * is a background flush and so we can choose to ignore it. Otherwise, if the
+ * current sequence is the same as @push_seq we need to do a flush. If
+ * @push_seq is less than the current sequence, then it has already been
+ * flushed and we don't need to do anything - the caller will wait for it to
+ * complete if necessary.
+ *
+ * @push_seq is a value rather than a flag because that allows us to do an
+ * unlocked check of the sequence number for a match. Hence we can allow log
+ * forces to run racily and not issue pushes for the same sequence twice. If we
+ * get a race between multiple pushes for the same sequence they will block on
+ * the first one and then abort, hence avoiding needless pushes.
  */
-int
+STATIC int
 xlog_cil_push(
        struct log      *log,
-       int             push_now)
+       xfs_lsn_t       push_seq)
 {
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_log_vec      *lv;
@@ -467,12 +390,14 @@ xlog_cil_push(
        if (!cil)
                return 0;
 
+       ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
+
        new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
        new_ctx->ticket = xlog_cil_ticket_alloc(log);
 
        /* lock out transaction commit, but don't block on background push */
        if (!down_write_trylock(&cil->xc_ctx_lock)) {
-               if (!push_now)
+               if (!push_seq)
                        goto out_free_ticket;
                down_write(&cil->xc_ctx_lock);
        }
@@ -483,7 +408,11 @@ xlog_cil_push(
                goto out_skip;
 
        /* check for spurious background flush */
-       if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+       if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+               goto out_skip;
+
+       /* check for a previously pushed sequence */
+       if (push_seq < cil->xc_ctx->sequence)
                goto out_skip;
 
        /*
@@ -529,6 +458,13 @@ xlog_cil_push(
        cil->xc_ctx = new_ctx;
 
        /*
+        * mirror the new sequence into the cil structure so that we can do
+        * unlocked checks against the current sequence in log forces without
+        * risking dereferencing a freed context pointer.
+        */
+       cil->xc_current_sequence = new_ctx->sequence;
+
+       /*
         * The switch is now done, so we can drop the context lock and move out
         * of a shared context. We can't just go straight to the commit record,
         * though - we need to synchronise with previous and future commits so
@@ -640,6 +576,94 @@ out_abort:
 }
 
 /*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * For more specific information about the order of operations in
+ * xfs_log_commit_cil() please refer to the comments in
+ * xfs_trans_commit_iclog().
+ *
+ * Called with the context lock already held in read mode to lock out
+ * background commit, returns without it held once background commits are
+ * allowed again.
+ */
+int
+xfs_log_commit_cil(
+       struct xfs_mount        *mp,
+       struct xfs_trans        *tp,
+       struct xfs_log_vec      *log_vector,
+       xfs_lsn_t               *commit_lsn,
+       int                     flags)
+{
+       struct log              *log = mp->m_log;
+       int                     log_flags = 0;
+       int                     push = 0;
+
+       if (flags & XFS_TRANS_RELEASE_LOG_RES)
+               log_flags = XFS_LOG_REL_PERM_RESERV;
+
+       if (XLOG_FORCED_SHUTDOWN(log)) {
+               xlog_cil_free_logvec(log_vector);
+               return XFS_ERROR(EIO);
+       }
+
+       /* lock out background commit */
+       down_read(&log->l_cilp->xc_ctx_lock);
+       xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
+
+       /* check we didn't blow the reservation */
+       if (tp->t_ticket->t_curr_res < 0)
+               xlog_print_tic_res(log->l_mp, tp->t_ticket);
+
+       /* attach the transaction to the CIL if it has any busy extents */
+       if (!list_empty(&tp->t_busy)) {
+               spin_lock(&log->l_cilp->xc_cil_lock);
+               list_splice_init(&tp->t_busy,
+                                       &log->l_cilp->xc_ctx->busy_extents);
+               spin_unlock(&log->l_cilp->xc_cil_lock);
+       }
+
+       tp->t_commit_lsn = *commit_lsn;
+       xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+       xfs_trans_unreserve_and_mod_sb(tp);
+
+       /*
+        * Once all the items of the transaction have been copied to the CIL,
+        * the items can be unlocked and freed.
+        *
+        * This needs to be done before we drop the CIL context lock because we
+        * have to update state in the log items and unlock them before they go
+        * to disk. If we don't, then the CIL checkpoint can race with us and
+        * we can run checkpoint completion before we've updated and unlocked
+        * the log items. This affects (at least) processing of stale buffers,
+        * inodes and EFIs.
+        */
+       xfs_trans_free_items(tp, *commit_lsn, 0);
+
+       /* check for background commit before unlock */
+       if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+               push = 1;
+
+       up_read(&log->l_cilp->xc_ctx_lock);
+
+       /*
+        * We need to push CIL every so often so we don't cache more than we
+        * can fit in the log. The limit really is that a checkpoint can't be
+        * more than half the log (the current checkpoint is not allowed to
+        * overwrite the previous checkpoint), but commit latency and memory
+        * usage limit this to a smaller size in most cases.
+        */
+       if (push)
+               xlog_cil_push(log, 0);
+       return 0;
+}
+
+/*
  * Conditionally push the CIL based on the sequence passed in.
  *
  * We only need to push if we haven't already pushed the sequence
@@ -653,39 +677,34 @@ out_abort:
  * commit lsn is there. It'll be empty, so this is broken for now.
  */
 xfs_lsn_t
-xlog_cil_push_lsn(
+xlog_cil_force_lsn(
        struct log      *log,
-       xfs_lsn_t       push_seq)
+       xfs_lsn_t       sequence)
 {
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_cil_ctx      *ctx;
        xfs_lsn_t               commit_lsn = NULLCOMMITLSN;
 
-restart:
-       down_write(&cil->xc_ctx_lock);
-       ASSERT(push_seq <= cil->xc_ctx->sequence);
-
-       /* check to see if we need to force out the current context */
-       if (push_seq == cil->xc_ctx->sequence) {
-               up_write(&cil->xc_ctx_lock);
-               xlog_cil_push(log, 1);
-               goto restart;
-       }
+       ASSERT(sequence <= cil->xc_current_sequence);
+
+       /*
+        * check to see if we need to force out the current context.
+        * xlog_cil_push() handles racing pushes for the same sequence,
+        * so no need to deal with it here.
+        */
+       if (sequence == cil->xc_current_sequence)
+               xlog_cil_push(log, sequence);
 
        /*
         * See if we can find a previous sequence still committing.
-        * We can drop the flush lock as soon as we have the cil lock
-        * because we are now only comparing contexts protected by
-        * the cil lock.
-        *
         * We need to wait for all previous sequence commits to complete
         * before allowing the force of push_seq to go ahead. Hence block
         * on commits for those as well.
         */
+restart:
        spin_lock(&cil->xc_cil_lock);
-       up_write(&cil->xc_ctx_lock);
        list_for_each_entry(ctx, &cil->xc_committing, committing) {
-               if (ctx->sequence > push_seq)
+               if (ctx->sequence > sequence)
                        continue;
                if (!ctx->commit_lsn) {
                        /*
@@ -695,7 +714,7 @@ restart:
                        sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
                        goto restart;
                }
-               if (ctx->sequence != push_seq)
+               if (ctx->sequence != sequence)
                        continue;
                /* found it! */
                commit_lsn = ctx->commit_lsn;
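Besides the double-checked push, the force-by-sequence path above also has to wait out earlier checkpoints that have switched contexts but not yet written their commit record. Below is a user-space sketch of that scan/sleep/restart idiom, with a pthread condition variable standing in for the kernel's sv_wait(); the structure layout and names are hypothetical, only the control flow mirrors the patch.

/*
 * Scan a list of committing contexts for @seq. If any context at or
 * below @seq has no commit lsn yet, sleep until a commit record is
 * written and rescan from the start, exactly as the restart: loop in
 * xlog_cil_force_lsn() does.
 */
#include <pthread.h>
#include <stdint.h>
#include <stddef.h>

struct ctx {
        uint64_t        sequence;
        uint64_t        commit_lsn;     /* 0 until the commit record is written */
        struct ctx      *next;
};

struct committing {
        pthread_mutex_t lock;           /* stands in for xc_cil_lock */
        pthread_cond_t  commit_wait;    /* stands in for xc_commit_wait */
        struct ctx      *head;          /* contexts still committing */
};

/* Return the commit lsn for @seq, or 0 if it is not on the list. */
static uint64_t wait_for_commit(struct committing *cl, uint64_t seq)
{
        uint64_t        commit_lsn = 0;
        struct ctx      *c;

restart:
        pthread_mutex_lock(&cl->lock);
        for (c = cl->head; c != NULL; c = c->next) {
                if (c->sequence > seq)
                        continue;       /* newer than the force target */
                if (c->commit_lsn == 0) {
                        /*
                         * Still being pushed: sleep until a commit record
                         * is written, then rescan from the beginning.
                         */
                        pthread_cond_wait(&cl->commit_wait, &cl->lock);
                        pthread_mutex_unlock(&cl->lock);
                        goto restart;
                }
                if (c->sequence != seq)
                        continue;
                commit_lsn = c->commit_lsn;     /* found it */
                break;
        }
        pthread_mutex_unlock(&cl->lock);
        return commit_lsn;
}

The rescan-from-scratch after every wakeup is what makes it safe to drop the lock while sleeping: the list may have changed arbitrarily in between, so no iterator state is carried across the wait.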
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 8c072618965c..ced52b98b322 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -422,6 +422,7 @@ struct xfs_cil {
        struct rw_semaphore     xc_ctx_lock;
        struct list_head        xc_committing;
        sv_t                    xc_commit_wait;
+       xfs_lsn_t               xc_current_sequence;
 };
 
 /*
@@ -562,8 +563,16 @@ int xlog_cil_init(struct log *log);
 void   xlog_cil_init_post_recovery(struct log *log);
 void   xlog_cil_destroy(struct log *log);
 
-int    xlog_cil_push(struct log *log, int push_now);
-xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
+/*
+ * CIL force routines
+ */
+xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence);
+
+static inline void
+xlog_cil_force(struct log *log)
+{
+       xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
+}
 
 /*
  * Unmount record type is used as a pseudo transaction type for the ticket.