author     Chris Metcalf <cmetcalf@tilera.com>    2010-08-13 19:59:15 -0400
committer  Chris Metcalf <cmetcalf@tilera.com>    2010-08-13 19:59:15 -0400
commit     7d72e6fa56c4100b9669efe0044f77ed9eb785a1 (patch)
tree       5e90bf4969809a1ab20b97432b85be20ccfaa1f4 /mm/page-writeback.c
parent     ba00376b0b13f234d839541a7b36a5bf5c2a4036 (diff)
parent     2be1f3a73dd02e38e181cf5abacb3d45a6a2d6b8 (diff)
Merge branch 'master' into for-linus
Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r--  mm/page-writeback.c | 185
1 files changed, 91 insertions, 94 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0c6258bd1ba3..20890d80c7ef 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -253,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
         }
 }
 
-/*
- * Clip the earned share of dirty pages to that which is actually available.
- * This avoids exceeding the total dirty_limit when the floating averages
- * fluctuate too quickly.
- */
-static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
-                unsigned long dirty, unsigned long *pbdi_dirty)
-{
-        unsigned long avail_dirty;
-
-        avail_dirty = global_page_state(NR_FILE_DIRTY) +
-                global_page_state(NR_WRITEBACK) +
-                global_page_state(NR_UNSTABLE_NFS) +
-                global_page_state(NR_WRITEBACK_TEMP);
-
-        if (avail_dirty < dirty)
-                avail_dirty = dirty - avail_dirty;
-        else
-                avail_dirty = 0;
-
-        avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
-                bdi_stat(bdi, BDI_WRITEBACK);
-
-        *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
-}
-
 static inline void task_dirties_fraction(struct task_struct *tsk,
                 long *numerator, long *denominator)
 {
@@ -287,16 +261,24 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
 }
 
 /*
- * scale the dirty limit
+ * task_dirty_limit - scale down dirty throttling threshold for one task
  *
  * task specific dirty limit:
  *
  *    dirty -= (dirty/8) * p_{t}
+ *
+ * To protect light/slow dirtying tasks from heavier/fast ones, we start
+ * throttling individual tasks before reaching the bdi dirty limit.
+ * Relatively low thresholds will be allocated to heavy dirtiers. So when
+ * dirty pages grow large, heavy dirtiers will be throttled first, which will
+ * effectively curb the growth of dirty pages. Light dirtiers with high enough
+ * dirty threshold may never get throttled.
  */
-static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
+static unsigned long task_dirty_limit(struct task_struct *tsk,
+                                      unsigned long bdi_dirty)
 {
         long numerator, denominator;
-        unsigned long dirty = *pdirty;
+        unsigned long dirty = bdi_dirty;
         u64 inv = dirty >> 3;
 
         task_dirties_fraction(tsk, &numerator, &denominator);
@@ -304,10 +286,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
         do_div(inv, denominator);
 
         dirty -= inv;
-        if (dirty < *pdirty/2)
-                dirty = *pdirty/2;
 
-        *pdirty = dirty;
+        return max(dirty, bdi_dirty/2);
 }
 
 /*
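The two hunks above are the heart of the per-task change: task_dirty_limit() now takes the bdi threshold by value and returns a scaled-down copy instead of writing through a pointer. Roughly, each task's threshold is lowered by up to 1/8 of the bdi limit, in proportion to its share p_{t} of recently dirtied pages, and never drops below half of the bdi limit. A minimal standalone sketch of that arithmetic (plain userspace C; task_dirty_limit_sketch() and the hard-coded fractions are illustrative stand-ins, not the kernel code, which obtains the fraction from task_dirties_fraction()):

#include <stdio.h>

/*
 * Sketch of the new task_dirty_limit() arithmetic:
 *
 *      dirty -= (dirty / 8) * numerator / denominator
 *
 * where numerator/denominator approximate this task's share p_{t} of
 * recently dirtied pages, and the result never drops below bdi_dirty/2.
 */
static unsigned long task_dirty_limit_sketch(unsigned long bdi_dirty,
                                             long numerator, long denominator)
{
        unsigned long dirty = bdi_dirty;
        unsigned long long inv = dirty >> 3;    /* at most 1/8 of the limit */

        inv *= numerator;
        inv /= denominator;                     /* do_div() in the kernel */

        dirty -= inv;
        return dirty > bdi_dirty / 2 ? dirty : bdi_dirty / 2;
}

int main(void)
{
        /* A heavy dirtier (p_t = 3/4) vs. a light one (p_t = 1/100). */
        printf("heavy dirtier: %lu\n", task_dirty_limit_sketch(1000, 3, 4));   /* 907 */
        printf("light dirtier: %lu\n", task_dirty_limit_sketch(1000, 1, 100)); /* 999 */
        return 0;
}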
@@ -417,9 +397,16 @@ unsigned long determine_dirtyable_memory(void)
         return x + 1;   /* Ensure that we never return 0 */
 }
 
-void
-get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
-                 unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
+/**
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ *
+ * Calculate the dirty thresholds based on sysctl parameters
+ * - vm.dirty_background_ratio or vm.dirty_background_bytes
+ * - vm.dirty_ratio or vm.dirty_bytes
+ * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * runtime tasks.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
 {
         unsigned long background;
         unsigned long dirty;
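The new kernel-doc spells out where the two thresholds come from: vm.dirty_background_ratio/vm.dirty_background_bytes for background writeback and vm.dirty_ratio/vm.dirty_bytes for throttling, with a 1/4 boost for PF_LESS_THROTTLE and real-time callers. The body of global_dirty_limits() is not shown in this hunk, so the following is only a rough userspace sketch of the bytes-overrides-ratio selection; it omits determine_dirtyable_memory() and the 1/4 boost, and the final background-below-dirty clamp is an assumption about the kernel's behavior rather than code taken from this diff:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/*
 * Rough sketch: a *_bytes sysctl, when non-zero, overrides the matching
 * *_ratio sysctl, which is otherwise applied to the number of dirtyable
 * pages. Background is assumed to be kept below the throttling limit.
 */
static void global_dirty_limits_sketch(unsigned long available_pages,
                                       unsigned long dirty_ratio,
                                       unsigned long dirty_bytes,
                                       unsigned long background_ratio,
                                       unsigned long background_bytes,
                                       unsigned long *pbackground,
                                       unsigned long *pdirty)
{
        unsigned long background, dirty;

        if (dirty_bytes)
                dirty = (dirty_bytes + PAGE_SIZE - 1) / PAGE_SIZE;
        else
                dirty = dirty_ratio * available_pages / 100;

        if (background_bytes)
                background = (background_bytes + PAGE_SIZE - 1) / PAGE_SIZE;
        else
                background = background_ratio * available_pages / 100;

        if (background >= dirty)
                background = dirty / 2;

        *pbackground = background;
        *pdirty = dirty;
}

int main(void)
{
        unsigned long bg, dirty;

        /* 1 GiB of dirtyable memory with default-like 10%/20% ratios. */
        global_dirty_limits_sketch(262144, 20, 0, 10, 0, &bg, &dirty);
        printf("background = %lu pages, dirty = %lu pages\n", bg, dirty);
        return 0;
}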
@@ -451,27 +438,37 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
         }
         *pbackground = background;
         *pdirty = dirty;
+}
+
+/**
+ * bdi_dirty_limit - @bdi's share of dirty throttling threshold
+ *
+ * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+ * - starving fast devices
+ * - piling up dirty pages (that will take long time to sync) on slow devices
+ *
+ * The bdi's share of dirty limit will be adapting to its throughput and
+ * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
+ */
+unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+{
+        u64 bdi_dirty;
+        long numerator, denominator;
+
+        /*
+         * Calculate this BDI's share of the dirty ratio.
+         */
+        bdi_writeout_fraction(bdi, &numerator, &denominator);
 
-        if (bdi) {
-                u64 bdi_dirty;
-                long numerator, denominator;
-
-                /*
-                 * Calculate this BDI's share of the dirty ratio.
-                 */
-                bdi_writeout_fraction(bdi, &numerator, &denominator);
-
-                bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
-                bdi_dirty *= numerator;
-                do_div(bdi_dirty, denominator);
-                bdi_dirty += (dirty * bdi->min_ratio) / 100;
-                if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
-                        bdi_dirty = dirty * bdi->max_ratio / 100;
-
-                *pbdi_dirty = bdi_dirty;
-                clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
-                task_dirty_limit(current, pbdi_dirty);
-        }
+        bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
+        bdi_dirty *= numerator;
+        do_div(bdi_dirty, denominator);
+
+        bdi_dirty += (dirty * bdi->min_ratio) / 100;
+        if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
+                bdi_dirty = dirty * bdi->max_ratio / 100;
+
+        return bdi_dirty;
 }
 
 /*
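Per the kernel-doc added above, bdi_dirty_limit() hands each backing device a slice of the global threshold proportional to its recent writeout fraction, on top of its reserved min_ratio and capped by its max_ratio. A standalone sketch of that arithmetic, with the writeout fraction passed in directly (in the kernel it comes from bdi_writeout_fraction(), and bdi_min_ratio is the global sum of reserved ratios):

#include <stdio.h>

/*
 * Sketch of the bdi_dirty_limit() arithmetic shown above: share
 * (100 - bdi_min_ratio)% of the global limit in proportion to this
 * device's recent writeout fraction, add its reserved min_ratio slice,
 * and cap the result at its max_ratio.
 */
static unsigned long bdi_dirty_limit_sketch(unsigned long dirty,
                                            unsigned int bdi_min_ratio,
                                            unsigned int min_ratio,
                                            unsigned int max_ratio,
                                            long numerator, long denominator)
{
        unsigned long long bdi_dirty;

        bdi_dirty = (unsigned long long)dirty * (100 - bdi_min_ratio) / 100;
        bdi_dirty *= numerator;
        bdi_dirty /= denominator;               /* do_div() in the kernel */

        bdi_dirty += (unsigned long long)dirty * min_ratio / 100;
        if (bdi_dirty > (unsigned long long)dirty * max_ratio / 100)
                bdi_dirty = (unsigned long long)dirty * max_ratio / 100;

        return (unsigned long)bdi_dirty;
}

int main(void)
{
        /*
         * Global limit of 100000 pages, no reserved min_ratio anywhere,
         * max_ratio left at 100: a device doing 3/4 of recent writeout
         * gets a much larger share than one doing 1/4.
         */
        printf("fast bdi: %lu\n", bdi_dirty_limit_sketch(100000, 0, 0, 100, 3, 4));
        printf("slow bdi: %lu\n", bdi_dirty_limit_sketch(100000, 0, 0, 100, 1, 4));
        return 0;
}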
@@ -491,7 +488,7 @@ static void balance_dirty_pages(struct address_space *mapping,
         unsigned long bdi_thresh;
         unsigned long pages_written = 0;
         unsigned long pause = 1;
-
+        bool dirty_exceeded = false;
         struct backing_dev_info *bdi = mapping->backing_dev_info;
 
         for (;;) {
@@ -502,18 +499,11 @@ static void balance_dirty_pages(struct address_space *mapping,
                         .range_cyclic   = 1,
                 };
 
-                get_dirty_limits(&background_thresh, &dirty_thresh,
-                                &bdi_thresh, bdi);
-
                 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
                                         global_page_state(NR_UNSTABLE_NFS);
                 nr_writeback = global_page_state(NR_WRITEBACK);
 
-                bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
-                bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
-
-                if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
-                        break;
+                global_dirty_limits(&background_thresh, &dirty_thresh);
 
                 /*
                  * Throttle it only when the background writeback cannot
@@ -524,26 +514,8 @@ static void balance_dirty_pages(struct address_space *mapping,
                                 (background_thresh + dirty_thresh) / 2)
                                 break;
 
-                if (!bdi->dirty_exceeded)
-                        bdi->dirty_exceeded = 1;
-
-                /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
-                 * Unstable writes are a feature of certain networked
-                 * filesystems (i.e. NFS) in which data may have been
-                 * written to the server's write cache, but has not yet
-                 * been flushed to permanent storage.
-                 * Only move pages to writeback if this bdi is over its
-                 * threshold otherwise wait until the disk writes catch
-                 * up.
-                 */
-                trace_wbc_balance_dirty_start(&wbc, bdi);
-                if (bdi_nr_reclaimable > bdi_thresh) {
-                        writeback_inodes_wb(&bdi->wb, &wbc);
-                        pages_written += write_chunk - wbc.nr_to_write;
-                        get_dirty_limits(&background_thresh, &dirty_thresh,
-                                        &bdi_thresh, bdi);
-                        trace_wbc_balance_dirty_written(&wbc, bdi);
-                }
+                bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+                bdi_thresh = task_dirty_limit(current, bdi_thresh);
 
                 /*
                  * In order to avoid the stacked BDI deadlock we need
@@ -558,16 +530,44 @@ static void balance_dirty_pages(struct address_space *mapping,
                 if (bdi_thresh < 2*bdi_stat_error(bdi)) {
                         bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
                         bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
-                } else if (bdi_nr_reclaimable) {
+                } else {
                         bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
                         bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
                 }
 
-                if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+                /*
+                 * The bdi thresh is somehow "soft" limit derived from the
+                 * global "hard" limit. The former helps to prevent heavy IO
+                 * bdi or process from holding back light ones; The latter is
+                 * the last resort safeguard.
+                 */
+                dirty_exceeded =
+                        (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
+                        || (nr_reclaimable + nr_writeback >= dirty_thresh);
+
+                if (!dirty_exceeded)
                         break;
-                if (pages_written >= write_chunk)
-                        break;          /* We've done our duty */
 
+                if (!bdi->dirty_exceeded)
+                        bdi->dirty_exceeded = 1;
+
+                /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
+                 * Unstable writes are a feature of certain networked
+                 * filesystems (i.e. NFS) in which data may have been
+                 * written to the server's write cache, but has not yet
+                 * been flushed to permanent storage.
+                 * Only move pages to writeback if this bdi is over its
+                 * threshold otherwise wait until the disk writes catch
+                 * up.
+                 */
+                trace_wbc_balance_dirty_start(&wbc, bdi);
+                if (bdi_nr_reclaimable > bdi_thresh) {
+                        writeback_inodes_wb(&bdi->wb, &wbc);
+                        pages_written += write_chunk - wbc.nr_to_write;
+                        trace_wbc_balance_dirty_written(&wbc, bdi);
+                        if (pages_written >= write_chunk)
+                                break;          /* We've done our duty */
+                }
                 trace_wbc_balance_dirty_wait(&wbc, bdi);
                 __set_current_state(TASK_INTERRUPTIBLE);
                 io_schedule_timeout(pause);
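This hunk also introduces the dirty_exceeded test whose comment calls the bdi threshold a "soft" limit derived from the global "hard" one: the dirtier keeps being throttled while either its bdi is over its share or the system is over the global threshold. A small sketch of that exit condition (standalone C with made-up page counts; in the kernel the counters come from bdi_stat()/global_page_state()):

#include <stdbool.h>
#include <stdio.h>

/*
 * Sketch of the new loop-exit test: keep throttling while either the
 * per-bdi ("soft") threshold or the global ("hard") threshold is
 * exceeded; otherwise the dirtying task may proceed.
 */
static bool dirty_exceeded_sketch(unsigned long bdi_nr_reclaimable,
                                  unsigned long bdi_nr_writeback,
                                  unsigned long bdi_thresh,
                                  unsigned long nr_reclaimable,
                                  unsigned long nr_writeback,
                                  unsigned long dirty_thresh)
{
        return (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh) ||
               (nr_reclaimable + nr_writeback >= dirty_thresh);
}

int main(void)
{
        /* bdi under its share and system under the global limit: no throttling. */
        printf("%d\n", dirty_exceeded_sketch(100, 50, 200, 500, 100, 1000)); /* 0 */
        /* bdi over its share: throttle even though the global limit still holds. */
        printf("%d\n", dirty_exceeded_sketch(180, 50, 200, 500, 100, 1000)); /* 1 */
        return 0;
}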
@@ -581,8 +581,7 @@ static void balance_dirty_pages(struct address_space *mapping,
                         pause = HZ / 10;
         }
 
-        if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
-                        bdi->dirty_exceeded)
+        if (!dirty_exceeded && bdi->dirty_exceeded)
                 bdi->dirty_exceeded = 0;
 
         if (writeback_in_progress(bdi))
@@ -597,9 +596,7 @@ static void balance_dirty_pages(struct address_space *mapping,
          * background_thresh, to keep the amount of dirty memory low.
          */
         if ((laptop_mode && pages_written) ||
-            (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
-                               + global_page_state(NR_UNSTABLE_NFS))
-                                          > background_thresh)))
+            (!laptop_mode && (nr_reclaimable > background_thresh)))
                 bdi_start_background_writeback(bdi);
 }
 
@@ -663,7 +660,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
         unsigned long dirty_thresh;
 
         for ( ; ; ) {
-                get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+                global_dirty_limits(&background_thresh, &dirty_thresh);
 
                 /*
                  * Boost the allowable dirty threshold a bit for page
@@ -825,10 +822,10 @@ void __init page_writeback_init(void)
 /*
  * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
  */
-#define WRITEBACK_TAG_BATCH 4096
 void tag_pages_for_writeback(struct address_space *mapping,
                              pgoff_t start, pgoff_t end)
 {
+#define WRITEBACK_TAG_BATCH 4096
         unsigned long tagged;
 
         do {