aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorWu Fengguang <fengguang.wu@intel.com>2010-08-11 17:17:37 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2010-08-12 11:43:29 -0400
commite50e37201ae2e7d6a52e87815759e6481f0bcfb9 (patch)
treeefb500382d5e9628351cb16286f579ad9bd455db /mm
parenta292dfa01794477126d3f022559eb235edde00b0 (diff)
writeback: balance_dirty_pages(): reduce calls to global_page_state
Reducing the number of times balance_dirty_pages calls global_page_state reduces the cache references and so improves write performance on a variety of workloads. 'perf stat' of simple fio write tests shows the reduction in cache accesses. The test is fio 'write,mmap,600Mb,pre_read' on an AMD AthlonX2 with 3 GB of memory (dirty_threshold approx. 600 MB), running each test 10 times, dropping the fastest & slowest values, then taking the average & standard deviation. Results, as average (s.d.) in millions (10^6): 2.6.31-rc8: 648.6 (14.6); +patch: 620.1 (16.5). This reduction is achieved by dropping clip_bdi_dirty_limit, as it rereads the counters to apply the dirty_threshold, and moving this check up into balance_dirty_pages, where the counters have already been read. Also, rearranging the for loop to contain only one copy of the limit tests allows the pdflush test after the loop to use the local copies of the counters rather than rereading them. In the common case with no throttling it now calls global_page_state 5 fewer times and bdi_stat 2 fewer times. Fengguang: This patch slightly changes behavior by replacing clip_bdi_dirty_limit() with the explicit check (nr_reclaimable + nr_writeback >= dirty_thresh) to avoid exceeding the dirty limit. Since the bdi dirty limit is mostly accurate, we don't need to clip routinely. A simple dirty limit check is enough. The check is necessary because, in principle, we should throttle everything calling balance_dirty_pages() when we're over the total limit, as said by Peter. We now set and clear dirty_exceeded based not only on the bdi dirty limits, but also on the global dirty limit. The global limit check is added in place of clip_bdi_dirty_limit() for safety and is not intended as a behavior change. The bdi limits should be tight enough to keep all dirty pages under the global limit most of the time; occasionally exceeding it by a small amount should be OK though. The change makes the logic more obvious: the global limit is the ultimate goal and shall always be imposed. 
We may now start background writeback work based on outdated conditions. That's safe because the bdi flush thread will (and has to) double-check the state. It reduces overall overhead because a test based on old state still has a good chance of being right. [akpm@linux-foundation.org] fix uninitialized dirty_exceeded Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com> Cc: Jan Kara <jack@suse.cz> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Hellwig <hch@infradead.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Jens Axboe <axboe@kernel.dk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/page-writeback.c95
1 files changed, 33 insertions, 62 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ea0b7cb4a8c7..2cf69a5e46e6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -253,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
253 } 253 }
254} 254}
255 255
256/*
257 * Clip the earned share of dirty pages to that which is actually available.
258 * This avoids exceeding the total dirty_limit when the floating averages
259 * fluctuate too quickly.
260 */
261static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
262 unsigned long dirty, unsigned long *pbdi_dirty)
263{
264 unsigned long avail_dirty;
265
266 avail_dirty = global_page_state(NR_FILE_DIRTY) +
267 global_page_state(NR_WRITEBACK) +
268 global_page_state(NR_UNSTABLE_NFS) +
269 global_page_state(NR_WRITEBACK_TEMP);
270
271 if (avail_dirty < dirty)
272 avail_dirty = dirty - avail_dirty;
273 else
274 avail_dirty = 0;
275
276 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
277 bdi_stat(bdi, BDI_WRITEBACK);
278
279 *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
280}
281
282static inline void task_dirties_fraction(struct task_struct *tsk, 256static inline void task_dirties_fraction(struct task_struct *tsk,
283 long *numerator, long *denominator) 257 long *numerator, long *denominator)
284{ 258{
@@ -469,7 +443,6 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
469 bdi_dirty = dirty * bdi->max_ratio / 100; 443 bdi_dirty = dirty * bdi->max_ratio / 100;
470 444
471 *pbdi_dirty = bdi_dirty; 445 *pbdi_dirty = bdi_dirty;
472 clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
473 task_dirty_limit(current, pbdi_dirty); 446 task_dirty_limit(current, pbdi_dirty);
474 } 447 }
475} 448}
@@ -491,7 +464,7 @@ static void balance_dirty_pages(struct address_space *mapping,
491 unsigned long bdi_thresh; 464 unsigned long bdi_thresh;
492 unsigned long pages_written = 0; 465 unsigned long pages_written = 0;
493 unsigned long pause = 1; 466 unsigned long pause = 1;
494 467 bool dirty_exceeded = false;
495 struct backing_dev_info *bdi = mapping->backing_dev_info; 468 struct backing_dev_info *bdi = mapping->backing_dev_info;
496 469
497 for (;;) { 470 for (;;) {
@@ -509,10 +482,35 @@ static void balance_dirty_pages(struct address_space *mapping,
509 global_page_state(NR_UNSTABLE_NFS); 482 global_page_state(NR_UNSTABLE_NFS);
510 nr_writeback = global_page_state(NR_WRITEBACK); 483 nr_writeback = global_page_state(NR_WRITEBACK);
511 484
512 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 485 /*
513 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); 486 * In order to avoid the stacked BDI deadlock we need
487 * to ensure we accurately count the 'dirty' pages when
488 * the threshold is low.
489 *
490 * Otherwise it would be possible to get thresh+n pages
491 * reported dirty, even though there are thresh-m pages
492 * actually dirty; with m+n sitting in the percpu
493 * deltas.
494 */
495 if (bdi_thresh < 2*bdi_stat_error(bdi)) {
496 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
497 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
498 } else {
499 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
500 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
501 }
514 502
515 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) 503 /*
504 * The bdi thresh is somehow "soft" limit derived from the
505 * global "hard" limit. The former helps to prevent heavy IO
506 * bdi or process from holding back light ones; The latter is
507 * the last resort safeguard.
508 */
509 dirty_exceeded =
510 (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
511 || (nr_reclaimable + nr_writeback >= dirty_thresh);
512
513 if (!dirty_exceeded)
516 break; 514 break;
517 515
518 /* 516 /*
@@ -540,34 +538,10 @@ static void balance_dirty_pages(struct address_space *mapping,
540 if (bdi_nr_reclaimable > bdi_thresh) { 538 if (bdi_nr_reclaimable > bdi_thresh) {
541 writeback_inodes_wb(&bdi->wb, &wbc); 539 writeback_inodes_wb(&bdi->wb, &wbc);
542 pages_written += write_chunk - wbc.nr_to_write; 540 pages_written += write_chunk - wbc.nr_to_write;
543 get_dirty_limits(&background_thresh, &dirty_thresh,
544 &bdi_thresh, bdi);
545 trace_wbc_balance_dirty_written(&wbc, bdi); 541 trace_wbc_balance_dirty_written(&wbc, bdi);
542 if (pages_written >= write_chunk)
543 break; /* We've done our duty */
546 } 544 }
547
548 /*
549 * In order to avoid the stacked BDI deadlock we need
550 * to ensure we accurately count the 'dirty' pages when
551 * the threshold is low.
552 *
553 * Otherwise it would be possible to get thresh+n pages
554 * reported dirty, even though there are thresh-m pages
555 * actually dirty; with m+n sitting in the percpu
556 * deltas.
557 */
558 if (bdi_thresh < 2*bdi_stat_error(bdi)) {
559 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
560 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
561 } else if (bdi_nr_reclaimable) {
562 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
563 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
564 }
565
566 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
567 break;
568 if (pages_written >= write_chunk)
569 break; /* We've done our duty */
570
571 trace_wbc_balance_dirty_wait(&wbc, bdi); 545 trace_wbc_balance_dirty_wait(&wbc, bdi);
572 __set_current_state(TASK_INTERRUPTIBLE); 546 __set_current_state(TASK_INTERRUPTIBLE);
573 io_schedule_timeout(pause); 547 io_schedule_timeout(pause);
@@ -581,8 +555,7 @@ static void balance_dirty_pages(struct address_space *mapping,
581 pause = HZ / 10; 555 pause = HZ / 10;
582 } 556 }
583 557
584 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 558 if (!dirty_exceeded && bdi->dirty_exceeded)
585 bdi->dirty_exceeded)
586 bdi->dirty_exceeded = 0; 559 bdi->dirty_exceeded = 0;
587 560
588 if (writeback_in_progress(bdi)) 561 if (writeback_in_progress(bdi))
@@ -597,9 +570,7 @@ static void balance_dirty_pages(struct address_space *mapping,
597 * background_thresh, to keep the amount of dirty memory low. 570 * background_thresh, to keep the amount of dirty memory low.
598 */ 571 */
599 if ((laptop_mode && pages_written) || 572 if ((laptop_mode && pages_written) ||
600 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) 573 (!laptop_mode && (nr_reclaimable > background_thresh)))
601 + global_page_state(NR_UNSTABLE_NFS))
602 > background_thresh)))
603 bdi_start_background_writeback(bdi); 574 bdi_start_background_writeback(bdi);
604} 575}
605 576