path: root/fs/xfs/xfs_trans_ail.c
author     Christoph Hellwig <hch@infradead.org>    2012-04-23 01:58:39 -0400
committer  Ben Myers <bpm@sgi.com>                  2012-05-14 17:20:31 -0400
commit     43ff2122e6492bcc88b065c433453dce88223b30 (patch)
tree       0f762cfb753edd73402b8830e0927d9efba30c61 /fs/xfs/xfs_trans_ail.c
parent     960c60af8b9481595e68875e79b2602e73169c29 (diff)
xfs: on-stack delayed write buffer lists
Queue delwri buffers on a local on-stack list instead of a per-buftarg one, and write back the buffers per-process instead of by waking up xfsbufd.

This is now easily doable given that we have very few places left that write delwri buffers:

 - log recovery: only done at mount time, and already forcing out the buffers synchronously using xfs_flush_buftarg.

 - quotacheck: same story.

 - dquot reclaim: writes out dirty dquots on the LRU under memory pressure. We might want to look into doing more of this via xfsaild, but it's already more optimal than the synchronous inode reclaim that writes each buffer synchronously.

 - xfsaild: this is the main beneficiary of the change. By keeping a local list of buffers to write we reduce the latency of writing out buffers, and more importantly we can remove all the delwri list promotions which were hitting the buffer cache hard under sustained metadata loads.

The implementation is very straightforward: xfs_buf_delwri_queue now gets a new list_head pointer that it adds the delwri buffers to, and all callers need to eventually submit the list using xfs_buf_delwri_submit or xfs_buf_delwri_submit_nowait. Buffers that are already on a delwri list are skipped in xfs_buf_delwri_queue, since they are assumed to be submitted via that other list.

The biggest change needed to pass down the buffer list was to the AIL pushing. Now that we operate on buffers, the trylock, push and pushbuf log item methods are merged into a single push routine, which tries to lock the item and, if possible, adds the buffer that needs writeback to the buffer list. This leads to much simpler code than the previous split, but requires the individual IOP_PUSH instances to unlock and reacquire the AIL lock around calls to blocking routines.

Given that xfsailds now also handle writing out buffers, the conditions for log forcing and the sleep times needed some small changes. The most important one is that we consider an AIL busy as long as we still have buffers to push, and the other one is that we do increment the pushed LSN for buffers that are under flushing at this moment, but still count them towards the stuck items for restart purposes. Without this we could hammer on stuck items without ever forcing the log and fail to make progress under heavy random delete workloads on fast flash storage devices.

[ Dave Chinner:
  - rebase on previous patches.
  - improved comments for XBF_DELWRI_Q handling
  - fix XBF_ASYNC handling in queue submission (test 106 failure)
  - rename delwri submit function buffer list parameters for clarity
  - xfs_efd_item_push() should return XFS_ITEM_PINNED ]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
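To make the calling convention described above concrete, here is a minimal sketch of how a caller is expected to use the on-stack delwri list. It assumes the post-patch prototypes xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *) and xfs_buf_delwri_submit(struct list_head *) named in the commit message; the wrapper function, its arguments and the buffer array are hypothetical and only illustrate the pattern.

    /*
     * Illustrative sketch only, not part of this commit: queue dirty
     * buffers onto a local on-stack list and submit them in one go.
     * The wrapper and its arguments are hypothetical.
     */
    static int example_write_buffers(struct xfs_buf **bps, int nr)
    {
    	LIST_HEAD(buffer_list);		/* local on-stack delwri list */
    	int		i;

    	/*
    	 * xfs_buf_delwri_queue() adds each buffer to the supplied list;
    	 * buffers already queued on another delwri list are skipped.
    	 */
    	for (i = 0; i < nr; i++)
    		xfs_buf_delwri_queue(bps[i], &buffer_list);

    	/*
    	 * Submit everything we queued.  xfs_buf_delwri_submit() waits
    	 * for the I/O to complete; xfs_buf_delwri_submit_nowait() would
    	 * issue the I/O asynchronously instead, as xfsaild does.
    	 */
    	return xfs_buf_delwri_submit(&buffer_list);
    }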
Diffstat (limited to 'fs/xfs/xfs_trans_ail.c')
-rw-r--r--   fs/xfs/xfs_trans_ail.c   129
1 file changed, 57 insertions(+), 72 deletions(-)
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 0425ca16738b..49d9cde33bb3 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -364,29 +364,31 @@ xfsaild_push(
 	xfs_log_item_t		*lip;
 	xfs_lsn_t		lsn;
 	xfs_lsn_t		target;
-	long			tout = 10;
+	long			tout;
 	int			stuck = 0;
+	int			flushing = 0;
 	int			count = 0;
-	int			push_xfsbufd = 0;
 
 	/*
-	 * If last time we ran we encountered pinned items, force the log first
-	 * and wait for it before pushing again.
+	 * If we encountered pinned items or did not finish writing out all
+	 * buffers the last time we ran, force the log first and wait for it
+	 * before pushing again.
 	 */
-	spin_lock(&ailp->xa_lock);
-	if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush &&
-	    !list_empty(&ailp->xa_ail)) {
+	if (ailp->xa_log_flush && ailp->xa_last_pushed_lsn == 0 &&
+	    (!list_empty_careful(&ailp->xa_buf_list) ||
+	     xfs_ail_min_lsn(ailp))) {
 		ailp->xa_log_flush = 0;
-		spin_unlock(&ailp->xa_lock);
+
 		XFS_STATS_INC(xs_push_ail_flush);
 		xfs_log_force(mp, XFS_LOG_SYNC);
-		spin_lock(&ailp->xa_lock);
 	}
 
+	spin_lock(&ailp->xa_lock);
 	lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn);
 	if (!lip) {
 		/*
-		 * AIL is empty or our push has reached the end.
+		 * If the AIL is empty or our push has reached the end we are
+		 * done now.
 		 */
 		xfs_trans_ail_cursor_done(ailp, &cur);
 		spin_unlock(&ailp->xa_lock);
@@ -395,55 +397,42 @@ xfsaild_push(
 
 	XFS_STATS_INC(xs_push_ail);
 
-	/*
-	 * While the item we are looking at is below the given threshold
-	 * try to flush it out. We'd like not to stop until we've at least
-	 * tried to push on everything in the AIL with an LSN less than
-	 * the given threshold.
-	 *
-	 * However, we will stop after a certain number of pushes and wait
-	 * for a reduced timeout to fire before pushing further. This
-	 * prevents use from spinning when we can't do anything or there is
-	 * lots of contention on the AIL lists.
-	 */
 	lsn = lip->li_lsn;
 	target = ailp->xa_target;
 	while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
 		int	lock_result;
+
 		/*
-		 * If we can lock the item without sleeping, unlock the AIL
-		 * lock and flush the item.  Then re-grab the AIL lock so we
-		 * can look for the next item on the AIL. List changes are
-		 * handled by the AIL lookup functions internally
-		 *
-		 * If we can't lock the item, either its holder will flush it
-		 * or it is already being flushed or it is being relogged.  In
-		 * any of these case it is being taken care of and we can just
-		 * skip to the next item in the list.
+		 * Note that IOP_PUSH may unlock and reacquire the AIL lock.  We
+		 * rely on the AIL cursor implementation to be able to deal with
+		 * the dropped lock.
 		 */
-		lock_result = IOP_TRYLOCK(lip);
-		spin_unlock(&ailp->xa_lock);
+		lock_result = IOP_PUSH(lip, &ailp->xa_buf_list);
 		switch (lock_result) {
 		case XFS_ITEM_SUCCESS:
 			XFS_STATS_INC(xs_push_ail_success);
 			trace_xfs_ail_push(lip);
 
-			IOP_PUSH(lip);
 			ailp->xa_last_pushed_lsn = lsn;
 			break;
 
-		case XFS_ITEM_PUSHBUF:
-			XFS_STATS_INC(xs_push_ail_pushbuf);
-			trace_xfs_ail_pushbuf(lip);
-
-			if (!IOP_PUSHBUF(lip)) {
-				trace_xfs_ail_pushbuf_pinned(lip);
-				stuck++;
-				ailp->xa_log_flush++;
-			} else {
-				ailp->xa_last_pushed_lsn = lsn;
-			}
-			push_xfsbufd = 1;
+		case XFS_ITEM_FLUSHING:
+			/*
+			 * The item or its backing buffer is already beeing
+			 * flushed.  The typical reason for that is that an
+			 * inode buffer is locked because we already pushed the
+			 * updates to it as part of inode clustering.
+			 *
+			 * We do not want to to stop flushing just because lots
+			 * of items are already beeing flushed, but we need to
+			 * re-try the flushing relatively soon if most of the
+			 * AIL is beeing flushed.
+			 */
+			XFS_STATS_INC(xs_push_ail_flushing);
+			trace_xfs_ail_flushing(lip);
+
+			flushing++;
+			ailp->xa_last_pushed_lsn = lsn;
 			break;
 
 		case XFS_ITEM_PINNED:
@@ -453,23 +442,22 @@ xfsaild_push(
 			stuck++;
 			ailp->xa_log_flush++;
 			break;
-
 		case XFS_ITEM_LOCKED:
 			XFS_STATS_INC(xs_push_ail_locked);
 			trace_xfs_ail_locked(lip);
+
 			stuck++;
 			break;
-
 		default:
 			ASSERT(0);
 			break;
 		}
 
-		spin_lock(&ailp->xa_lock);
 		count++;
 
 		/*
 		 * Are there too many items we can't do anything with?
+		 *
 		 * If we we are skipping too many items because we can't flush
 		 * them or they are already being flushed, we back off and
 		 * given them time to complete whatever operation is being
@@ -491,42 +479,36 @@ xfsaild_push(
 	xfs_trans_ail_cursor_done(ailp, &cur);
 	spin_unlock(&ailp->xa_lock);
 
-	if (push_xfsbufd) {
-		/* we've got delayed write buffers to flush */
-		wake_up_process(mp->m_ddev_targp->bt_task);
-	}
+	if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list))
+		ailp->xa_log_flush++;
 
-	/* assume we have more work to do in a short while */
+	if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
 out_done:
-	if (!count) {
-		/* We're past our target or empty, so idle */
-		ailp->xa_last_pushed_lsn = 0;
-		ailp->xa_log_flush = 0;
-
-		tout = 50;
-	} else if (XFS_LSN_CMP(lsn, target) >= 0) {
 		/*
-		 * We reached the target so wait a bit longer for I/O to
-		 * complete and remove pushed items from the AIL before we
-		 * start the next scan from the start of the AIL.
+		 * We reached the target or the AIL is empty, so wait a bit
+		 * longer for I/O to complete and remove pushed items from the
+		 * AIL before we start the next scan from the start of the AIL.
 		 */
 		tout = 50;
 		ailp->xa_last_pushed_lsn = 0;
-	} else if ((stuck * 100) / count > 90) {
+	} else if (((stuck + flushing) * 100) / count > 90) {
 		/*
-		 * Either there is a lot of contention on the AIL or we
-		 * are stuck due to operations in progress. "Stuck" in this
-		 * case is defined as >90% of the items we tried to push
-		 * were stuck.
+		 * Either there is a lot of contention on the AIL or we are
+		 * stuck due to operations in progress. "Stuck" in this case
+		 * is defined as >90% of the items we tried to push were stuck.
 		 *
 		 * Backoff a bit more to allow some I/O to complete before
-		 * restarting from the start of the AIL. This prevents us
-		 * from spinning on the same items, and if they are pinned will
-		 * all the restart to issue a log force to unpin the stuck
-		 * items.
+		 * restarting from the start of the AIL. This prevents us from
+		 * spinning on the same items, and if they are pinned will all
+		 * the restart to issue a log force to unpin the stuck items.
 		 */
 		tout = 20;
 		ailp->xa_last_pushed_lsn = 0;
+	} else {
+		/*
+		 * Assume we have more work to do in a short while.
+		 */
+		tout = 10;
 	}
 
 	return tout;
@@ -539,6 +521,8 @@ xfsaild(
 	struct xfs_ail	*ailp = data;
 	long		tout = 0;	/* milliseconds */
 
+	current->flags |= PF_MEMALLOC;
+
 	while (!kthread_should_stop()) {
 		if (tout && tout <= 20)
 			__set_current_state(TASK_KILLABLE);
@@ -794,6 +778,7 @@ xfs_trans_ail_init(
 	INIT_LIST_HEAD(&ailp->xa_ail);
 	INIT_LIST_HEAD(&ailp->xa_cursors);
 	spin_lock_init(&ailp->xa_lock);
+	INIT_LIST_HEAD(&ailp->xa_buf_list);
 	init_waitqueue_head(&ailp->xa_empty);
 
 	ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",