path: root/mm/page-writeback.c
author	Chris Metcalf <cmetcalf@tilera.com>	2010-08-13 19:59:15 -0400
committer	Chris Metcalf <cmetcalf@tilera.com>	2010-08-13 19:59:15 -0400
commit	7d72e6fa56c4100b9669efe0044f77ed9eb785a1 (patch)
tree	5e90bf4969809a1ab20b97432b85be20ccfaa1f4 /mm/page-writeback.c
parent	ba00376b0b13f234d839541a7b36a5bf5c2a4036 (diff)
parent	2be1f3a73dd02e38e181cf5abacb3d45a6a2d6b8 (diff)
Merge branch 'master' into for-linus
Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r--	mm/page-writeback.c	185
1 file changed, 91 insertions, 94 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0c6258bd1ba3..20890d80c7ef 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -253,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
 	}
 }
 
-/*
- * Clip the earned share of dirty pages to that which is actually available.
- * This avoids exceeding the total dirty_limit when the floating averages
- * fluctuate too quickly.
- */
-static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
-		unsigned long dirty, unsigned long *pbdi_dirty)
-{
-	unsigned long avail_dirty;
-
-	avail_dirty = global_page_state(NR_FILE_DIRTY) +
-		global_page_state(NR_WRITEBACK) +
-		global_page_state(NR_UNSTABLE_NFS) +
-		global_page_state(NR_WRITEBACK_TEMP);
-
-	if (avail_dirty < dirty)
-		avail_dirty = dirty - avail_dirty;
-	else
-		avail_dirty = 0;
-
-	avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
-		bdi_stat(bdi, BDI_WRITEBACK);
-
-	*pbdi_dirty = min(*pbdi_dirty, avail_dirty);
-}
-
 static inline void task_dirties_fraction(struct task_struct *tsk,
 		long *numerator, long *denominator)
 {
@@ -287,16 +261,24 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
 }
 
 /*
- * scale the dirty limit
+ * task_dirty_limit - scale down dirty throttling threshold for one task
  *
  * task specific dirty limit:
  *
  * dirty -= (dirty/8) * p_{t}
+ *
+ * To protect light/slow dirtying tasks from heavier/fast ones, we start
+ * throttling individual tasks before reaching the bdi dirty limit.
+ * Relatively low thresholds will be allocated to heavy dirtiers. So when
+ * dirty pages grow large, heavy dirtiers will be throttled first, which will
+ * effectively curb the growth of dirty pages. Light dirtiers with high enough
+ * dirty threshold may never get throttled.
  */
-static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
+static unsigned long task_dirty_limit(struct task_struct *tsk,
+				      unsigned long bdi_dirty)
 {
 	long numerator, denominator;
-	unsigned long dirty = *pdirty;
+	unsigned long dirty = bdi_dirty;
 	u64 inv = dirty >> 3;
 
 	task_dirties_fraction(tsk, &numerator, &denominator);
@@ -304,10 +286,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
 	do_div(inv, denominator);
 
 	dirty -= inv;
-	if (dirty < *pdirty/2)
-		dirty = *pdirty/2;
 
-	*pdirty = dirty;
+	return max(dirty, bdi_dirty/2);
 }
 
 /*
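The rewritten task_dirty_limit() applies the formula documented above: the bdi threshold handed to a task is reduced by up to 1/8 in proportion to that task's share of recent dirtying, and is floored at half the bdi threshold so light dirtiers are never squeezed toward zero. The following is a minimal userspace sketch of that arithmetic only, with made-up page counts and dirtying fractions, and plain 64-bit division standing in for the kernel's do_div():

/*
 * Illustrative only: mirrors the task_dirty_limit() math with assumed
 * inputs.  Not kernel code.
 */
#include <stdio.h>

static unsigned long task_dirty_limit_sketch(unsigned long bdi_dirty,
					     long numerator, long denominator)
{
	unsigned long dirty = bdi_dirty;
	unsigned long long inv = dirty >> 3;	/* dirty/8 */

	inv *= numerator;	/* scale by this task's dirtying fraction p_t */
	inv /= denominator;	/* stands in for do_div() */

	dirty -= (unsigned long)inv;	/* dirty -= (dirty/8) * p_t */
	return dirty > bdi_dirty / 2 ? dirty : bdi_dirty / 2;	/* floor at half */
}

int main(void)
{
	/* A task doing half of the recent dirtying vs. one doing 1% of it. */
	printf("heavy dirtier threshold: %lu\n",
	       task_dirty_limit_sketch(1000, 1, 2));
	printf("light dirtier threshold: %lu\n",
	       task_dirty_limit_sketch(1000, 1, 100));
	return 0;
}

With a bdi limit of 1000 pages this prints 938 for the heavy dirtier and 999 for the light one, which is the intended effect: the heavy dirtier hits throttling first.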
@@ -417,9 +397,16 @@ unsigned long determine_dirtyable_memory(void)
 	return x + 1;	/* Ensure that we never return 0 */
 }
 
-void
-get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
-		 unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
+/**
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ *
+ * Calculate the dirty thresholds based on sysctl parameters
+ * - vm.dirty_background_ratio  or  vm.dirty_background_bytes
+ * - vm.dirty_ratio             or  vm.dirty_bytes
+ * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * runtime tasks.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
 	unsigned long background;
 	unsigned long dirty;
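As the new kerneldoc says, global_dirty_limits() derives both thresholds from the sysctls: a *_bytes setting, when non-zero, takes precedence over the corresponding *_ratio of dirtyable memory, and the background threshold is kept below the hard threshold. A simplified stand-alone sketch of that selection logic follows; the 1/4 boost mentioned in the comment is omitted, and the page counts and page size are assumptions:

/*
 * Simplified sketch of the threshold selection in global_dirty_limits().
 * All inputs are assumed values; not kernel code.
 */
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096UL

static void global_dirty_limits_sketch(unsigned long dirtyable_pages,
				       unsigned long dirty_ratio,
				       unsigned long dirty_bytes,
				       unsigned long background_ratio,
				       unsigned long background_bytes,
				       unsigned long *pbackground,
				       unsigned long *pdirty)
{
	unsigned long background, dirty;

	if (dirty_bytes)
		dirty = dirty_bytes / SKETCH_PAGE_SIZE;	/* bytes override ratio */
	else
		dirty = (dirty_ratio * dirtyable_pages) / 100;

	if (background_bytes)
		background = background_bytes / SKETCH_PAGE_SIZE;
	else
		background = (background_ratio * dirtyable_pages) / 100;

	if (background >= dirty)
		background = dirty / 2;	/* keep background below the hard limit */

	*pbackground = background;
	*pdirty = dirty;
}

int main(void)
{
	unsigned long background, dirty;

	/* 1,000,000 dirtyable pages, 10%/20% ratios, no byte limits set. */
	global_dirty_limits_sketch(1000000, 20, 0, 10, 0, &background, &dirty);
	printf("background=%lu dirty=%lu\n", background, dirty);
	return 0;
}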
@@ -451,27 +438,37 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
 	}
 	*pbackground = background;
 	*pdirty = dirty;
+}
+
+/**
+ * bdi_dirty_limit - @bdi's share of dirty throttling threshold
+ *
+ * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+ * - starving fast devices
+ * - piling up dirty pages (that will take long time to sync) on slow devices
+ *
+ * The bdi's share of dirty limit will be adapting to its throughput and
+ * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
+ */
+unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+{
+	u64 bdi_dirty;
+	long numerator, denominator;
+
+	/*
+	 * Calculate this BDI's share of the dirty ratio.
+	 */
+	bdi_writeout_fraction(bdi, &numerator, &denominator);
 
-	if (bdi) {
-		u64 bdi_dirty;
-		long numerator, denominator;
+	bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
+	bdi_dirty *= numerator;
+	do_div(bdi_dirty, denominator);
 
-		/*
-		 * Calculate this BDI's share of the dirty ratio.
-		 */
-		bdi_writeout_fraction(bdi, &numerator, &denominator);
-
-		bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
-		bdi_dirty *= numerator;
-		do_div(bdi_dirty, denominator);
-		bdi_dirty += (dirty * bdi->min_ratio) / 100;
-		if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
-			bdi_dirty = dirty * bdi->max_ratio / 100;
-
-		*pbdi_dirty = bdi_dirty;
-		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
-		task_dirty_limit(current, pbdi_dirty);
-	}
+	bdi_dirty += (dirty * bdi->min_ratio) / 100;
+	if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
+		bdi_dirty = dirty * bdi->max_ratio / 100;
+
+	return bdi_dirty;
 }
 
 /*
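The new bdi_dirty_limit() splits that global threshold between devices in proportion to each bdi's recent writeout fraction, then clamps the result by the per-bdi min_ratio/max_ratio knobs. A stand-alone sketch of the same proportional split, with an assumed writeout fraction and plain division in place of do_div():

/*
 * Sketch of the proportional split done by bdi_dirty_limit().  The global
 * reserve, the writeout fraction and the per-bdi ratios are made-up inputs.
 */
#include <stdio.h>

static unsigned long bdi_dirty_limit_sketch(unsigned long dirty,
					    long numerator, long denominator,
					    unsigned int min_ratio,
					    unsigned int max_ratio,
					    unsigned int global_min_ratio)
{
	unsigned long long bdi_dirty;

	/* share the globally allowed dirty pages by recent writeout speed */
	bdi_dirty = ((unsigned long long)dirty * (100 - global_min_ratio)) / 100;
	bdi_dirty *= numerator;
	bdi_dirty /= denominator;	/* stands in for do_div() */

	/* honour the per-bdi min/max ratio knobs, if configured */
	bdi_dirty += ((unsigned long long)dirty * min_ratio) / 100;
	if (bdi_dirty > ((unsigned long long)dirty * max_ratio) / 100)
		bdi_dirty = ((unsigned long long)dirty * max_ratio) / 100;

	return (unsigned long)bdi_dirty;
}

int main(void)
{
	/* Global limit of 10000 pages; a fast device doing 3/4 of recent
	 * writeout vs. a slow one doing 1/4, no min/max ratios configured. */
	printf("fast bdi: %lu\n", bdi_dirty_limit_sketch(10000, 3, 4, 0, 100, 0));
	printf("slow bdi: %lu\n", bdi_dirty_limit_sketch(10000, 1, 4, 0, 100, 0));
	return 0;
}

With these inputs the fast device gets a 7500-page share and the slow one 2500, which is the "allocate high/low limits to fast/slow devices" behaviour described in the comment.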
@@ -491,7 +488,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 	unsigned long bdi_thresh;
 	unsigned long pages_written = 0;
 	unsigned long pause = 1;
-
+	bool dirty_exceeded = false;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 
 	for (;;) {
@@ -502,18 +499,11 @@ static void balance_dirty_pages(struct address_space *mapping,
 			.range_cyclic	= 1,
 		};
 
-		get_dirty_limits(&background_thresh, &dirty_thresh,
-				&bdi_thresh, bdi);
-
 		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
 					global_page_state(NR_UNSTABLE_NFS);
 		nr_writeback = global_page_state(NR_WRITEBACK);
 
-		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
-		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
-
-		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
-			break;
+		global_dirty_limits(&background_thresh, &dirty_thresh);
 
 		/*
 		 * Throttle it only when the background writeback cannot
@@ -524,26 +514,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 		    (background_thresh + dirty_thresh) / 2)
 			break;
 
-		if (!bdi->dirty_exceeded)
-			bdi->dirty_exceeded = 1;
-
-		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
-		 * Unstable writes are a feature of certain networked
-		 * filesystems (i.e. NFS) in which data may have been
-		 * written to the server's write cache, but has not yet
-		 * been flushed to permanent storage.
-		 * Only move pages to writeback if this bdi is over its
-		 * threshold otherwise wait until the disk writes catch
-		 * up.
-		 */
-		trace_wbc_balance_dirty_start(&wbc, bdi);
-		if (bdi_nr_reclaimable > bdi_thresh) {
-			writeback_inodes_wb(&bdi->wb, &wbc);
-			pages_written += write_chunk - wbc.nr_to_write;
-			get_dirty_limits(&background_thresh, &dirty_thresh,
-				       &bdi_thresh, bdi);
-			trace_wbc_balance_dirty_written(&wbc, bdi);
-		}
+		bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+		bdi_thresh = task_dirty_limit(current, bdi_thresh);
 
 		/*
 		 * In order to avoid the stacked BDI deadlock we need
@@ -558,16 +530,44 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (bdi_thresh < 2*bdi_stat_error(bdi)) {
 			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
 			bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
-		} else if (bdi_nr_reclaimable) {
+		} else {
 			bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
 			bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 		}
 
-		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+		/*
+		 * The bdi thresh is somehow "soft" limit derived from the
+		 * global "hard" limit. The former helps to prevent heavy IO
+		 * bdi or process from holding back light ones; The latter is
+		 * the last resort safeguard.
+		 */
+		dirty_exceeded =
+			(bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
+			|| (nr_reclaimable + nr_writeback >= dirty_thresh);
+
+		if (!dirty_exceeded)
 			break;
-		if (pages_written >= write_chunk)
-			break;		/* We've done our duty */
 
+		if (!bdi->dirty_exceeded)
+			bdi->dirty_exceeded = 1;
+
+		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
+		 * Unstable writes are a feature of certain networked
+		 * filesystems (i.e. NFS) in which data may have been
+		 * written to the server's write cache, but has not yet
+		 * been flushed to permanent storage.
+		 * Only move pages to writeback if this bdi is over its
+		 * threshold otherwise wait until the disk writes catch
+		 * up.
+		 */
+		trace_wbc_balance_dirty_start(&wbc, bdi);
+		if (bdi_nr_reclaimable > bdi_thresh) {
+			writeback_inodes_wb(&bdi->wb, &wbc);
+			pages_written += write_chunk - wbc.nr_to_write;
+			trace_wbc_balance_dirty_written(&wbc, bdi);
+			if (pages_written >= write_chunk)
+				break;	/* We've done our duty */
+		}
 		trace_wbc_balance_dirty_wait(&wbc, bdi);
 		__set_current_state(TASK_INTERRUPTIBLE);
 		io_schedule_timeout(pause);
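With the bdi threshold acting as the "soft" limit under the global "hard" one, the loop's exit test collapses into a single predicate: a writer keeps being throttled while either its bdi is over its own share or the system as a whole is over the global limit. A sketch of that predicate with assumed page counts:

/*
 * Sketch of the dirty_exceeded test: per-bdi threshold as the soft limit,
 * global threshold as the hard one.  All counts below are assumptions.
 */
#include <stdbool.h>
#include <stdio.h>

static bool dirty_exceeded_sketch(unsigned long bdi_dirty, unsigned long bdi_thresh,
				  unsigned long global_dirty, unsigned long dirty_thresh)
{
	return (bdi_dirty >= bdi_thresh) || (global_dirty >= dirty_thresh);
}

int main(void)
{
	/* under both limits: not throttled */
	printf("%d\n", dirty_exceeded_sketch(900, 1000, 5000, 6000));
	/* under the bdi's soft limit but over the global hard limit: throttled */
	printf("%d\n", dirty_exceeded_sketch(900, 1000, 6500, 6000));
	return 0;
}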
@@ -581,8 +581,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 			pause = HZ / 10;
 	}
 
-	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
-			bdi->dirty_exceeded)
+	if (!dirty_exceeded && bdi->dirty_exceeded)
 		bdi->dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
@@ -597,9 +596,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 	if ((laptop_mode && pages_written) ||
-	    (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
-			       + global_page_state(NR_UNSTABLE_NFS))
-					  > background_thresh)))
+	    (!laptop_mode && (nr_reclaimable > background_thresh)))
 		bdi_start_background_writeback(bdi);
 }
 
@@ -663,7 +660,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 	unsigned long dirty_thresh;
 
 	for ( ; ; ) {
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+		global_dirty_limits(&background_thresh, &dirty_thresh);
 
 		/*
 		 * Boost the allowable dirty threshold a bit for page
@@ -825,10 +822,10 @@ void __init page_writeback_init(void)
 /*
  * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
  */
-#define WRITEBACK_TAG_BATCH 4096
 void tag_pages_for_writeback(struct address_space *mapping,
 			     pgoff_t start, pgoff_t end)
 {
+#define WRITEBACK_TAG_BATCH 4096
 	unsigned long tagged;
 
 	do {
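The final hunk only narrows the scope of WRITEBACK_TAG_BATCH, but the pattern it supports is the usual one for bounding lock hold times: take the lock, process at most one batch, drop the lock so other users can get in, and repeat. A generic userspace sketch of that pattern, with a pthread mutex standing in for tree_lock and hypothetical item counts:

/*
 * Generic batching sketch: never hold the lock for more than BATCH items
 * at a time, analogous to tagging pages in WRITEBACK_TAG_BATCH chunks
 * under tree_lock.  The mutex and counts are stand-ins, not kernel API.
 */
#include <pthread.h>
#include <stdio.h>

#define BATCH 4096

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void tag_in_batches(unsigned long total)
{
	unsigned long done = 0;

	while (done < total) {
		unsigned long n = (total - done > BATCH) ? BATCH : total - done;

		pthread_mutex_lock(&lock);
		done += n;			/* stand-in for tagging n items */
		pthread_mutex_unlock(&lock);	/* give other users a chance */
	}
}

int main(void)
{
	tag_in_batches(10000);
	printf("tagged 10000 items in batches of %d\n", BATCH);
	return 0;
}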